Commit 5921c78

Add SDK support for vllm moe

1 parent ca37da4

File tree: 7 files changed, +68 -50 lines

collector/vllm/collect_moe.py

Lines changed: 25 additions & 39 deletions

@@ -106,7 +106,6 @@ def power_law_logits_v3(num_tokens, num_experts, topk, ep, alpha, return_first_gpu_only
     )
     num_tokens_per_expert = num_tokens_per_expert_reshaped.view(-1)

-
     revised_num_tokens = num_tokens
     revised_topk = topk
     if return_first_gpu_only:

@@ -144,7 +143,6 @@ def power_law_logits_v3(num_tokens, num_experts, topk, ep, alpha, return_first_gpu_only
     expert_assignments = torch.tensor(expert_assignments, dtype=torch.long)
     h_selected_experts = expert_assignments.reshape(revised_topk, revised_num_tokens).T

-
     expert_map = F.one_hot(h_selected_experts.long(), num_classes=num_experts).sum(1)
     router_logits = F.softmax(expert_map.half(), dim=1)
     return router_logits

@@ -188,7 +186,7 @@ def get_moe_test_cases():
     ep_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
     num_gpu_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
     alpha_list = [1.01, 1.2]
-
+
     # Model configurations: [hidden_size, inter_size, topk, num_experts, model_name]
     model_config_list = [
         [4096, 14336, 2, 8, "MOE_Mixtral8x7B"],  # mixtral_8x7b

@@ -204,7 +202,7 @@ def get_moe_test_cases():
     moe_list = ["float16"]

     if get_sm_version() > 86:
-        moe_list += ["fp8",]
+        moe_list += ["fp8"]

     test_cases = []

@@ -251,6 +249,7 @@ def get_moe_test_cases():

     return test_cases

+
 def run_moe_torch(
     moe_type,
     num_tokens_lists,

@@ -301,18 +300,18 @@ def run_moe_torch(
     # w1: gate + up projection weights [num_experts, 2 * inter_size, hidden_size]
     # w2: down projection weights [num_experts, hidden_size, inter_size]
     w1 = torch.randn(
-        local_num_experts,
-        2 * local_inter_size,
-        hidden_size,
-        dtype=torch.float16,
-        device=device
+        local_num_experts,
+        2 * local_inter_size,
+        hidden_size,
+        dtype=torch.float16,
+        device=device,
     )
     w2 = torch.randn(
-        local_num_experts,
-        hidden_size,
-        local_inter_size,
-        dtype=torch.float16,
-        device=device
+        local_num_experts,
+        hidden_size,
+        local_inter_size,
+        dtype=torch.float16,
+        device=device,
     )

     if dtype == torch.float8_e4m3fn:

@@ -332,14 +331,13 @@ def run_moe_torch(
     topk_ids_list = []

     for _ in range(num_iter):
-        logits = power_law_logits_v3(
-            num_tokens,
-            num_experts,
-            topk,
-            moe_ep_size,
-            power_law_alpha,
-            return_first_gpu_only=True
-        ).half().to(device)
+        logits = (
+            power_law_logits_v3(
+                num_tokens, num_experts, topk, moe_ep_size, power_law_alpha, return_first_gpu_only=True
+            )
+            .half()
+            .to(device)
+        )
         weights, ids = torch.topk(logits, local_topk, dim=-1)
         topk_weights_list.append(F.softmax(weights, dim=-1))
         topk_ids_list.append(ids)

@@ -356,7 +354,6 @@ def run_moe_torch(
     else:
         raise ValueError(f"Unsupported distributed mode: {distributed}")

-
     num_warmups = 3
     num_runs = 6
     if distributed == "power_law":

@@ -418,11 +415,11 @@ def run_iterations(use_cuda_graph=False):

     try:
         latency = run_iterations(use_cuda_graph=False)
-    except torch.OutOfMemoryError as e:
+    except torch.OutOfMemoryError:
         # If OOM, check if we had at least one successful run.
         if num_tokens_idx > 0:
             break
-        raise e
+        raise

     print(f"moe latency: {latency}")
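
Note on the exception handling above: a bare raise re-raises the active exception unchanged, whereas raise e re-raises the same object but appends the re-raise site to its traceback. A minimal illustration (allocate() is a hypothetical stand-in, not part of the collector):

import torch

def allocate():
    return torch.empty((1 << 40,), device="cuda")  # deliberately oversized

try:
    x = allocate()
except torch.OutOfMemoryError:
    # A bare raise preserves the original traceback exactly,
    # which is why the diff drops the `as e` / `raise e` form.
    raise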

@@ -454,24 +451,13 @@ def run_iterations(use_cuda_graph=False):

 if __name__ == "__main__":
     test_cases = get_moe_test_cases()
-    # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 4096, 14336, 2, 8, 1, 2, 'MOE_Mixtral8x7B', 'moe_perf.txt', 'power_law', 1.2]]
-    # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 4096, 14336, 2, 8, 1, 2, 'MOE_Mixtral8x7B', 'moe_perf.txt', 'balanced', 1.01]]
-    # test_cases = [['float16', [128, 256, 320], 4096, 14336, 2, 8, 1, 2, 'MOE_Mixtral8x7B', 'moe_perf.txt', 'power_law', 1.01]]
-    test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 4096, 14336, 2, 8, 1, 2, 'MOE_Mixtral8x7B', 'moe_perf.txt', 'power_law', 1.01]]
-    # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 6144, 2560, 8, 160, 1, 1, 'QWEN3_480B', 'moe_perf.txt', 'power_law', 1.2]]
-    # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 7168, 2048, 8, 384, 1, 1, 'KIMI_K2', 'moe_perf.txt', 'power_law', 1.01]]
-    # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 7168, 2048, 8, 384, 1, 4, 'KIMI_K2', 'moe_perf.txt', 'power_law', 1.01]]
-    # test_cases = [['float16', [1, 2, 4, 8, 16, 32, 48, 64, 80, 96, 128, 160, 192, 256, 320, 384, 512, 768, 1024, 1536, 2048, 3072, 4096, 6144, 8192, 12288, 16384, 20480, 32768, 65536], 2048, 768, 8, 128, 2, 32, 'QWEN3_30B_A3B', 'moe_perf.txt', 'power_law', 1.01]]
-
-    test_cases = [['float16',[65536],4096,14336,2,8,1,1,'MOE_Mixtral8x7B', 'moe_perf.txt', 'power_law', 1.01]]
-
     print(f"Total test cases: {len(test_cases)}")
-
-    for test_case in test_cases[:40]:
+
+    for test_case in test_cases:
         print(f"Running test case: {test_case}")
         try:
             run_moe_torch(*test_case)
         except Exception as e:
             print(f"Test case failed: {test_case}")
             print(f"Error: {e}")
-                continue
+            continue
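
power_law_logits_v3 appears above only in fragments. As a rough, self-contained sketch of the idea the visible lines imply — skew expert popularity with a power law, pick top-k experts per token, then turn the one-hot expert map into router logits via softmax — something like the following; the function name and sampling scheme here are illustrative assumptions, not the collector's exact algorithm:

import torch
import torch.nn.functional as F

def power_law_router_logits(num_tokens, num_experts, topk, alpha):
    # Assumed scheme: expert popularity p_i ~ i^(-alpha) over popularity-ranked experts.
    probs = torch.arange(1, num_experts + 1, dtype=torch.float32).pow(-alpha)
    probs = (probs / probs.sum()).expand(num_tokens, -1).contiguous()
    # Draw topk distinct experts per token from the skewed distribution.
    selected = torch.multinomial(probs, topk, replacement=False)
    # One-hot expert map summed over the topk picks, then softmax into logits,
    # mirroring the F.one_hot(...).sum(1) / F.softmax(...) lines in the diff.
    expert_map = F.one_hot(selected, num_classes=num_experts).sum(1)
    return F.softmax(expert_map.float(), dim=1)

logits = power_law_router_logits(num_tokens=16, num_experts=8, topk=2, alpha=1.2)
weights, ids = torch.topk(logits, k=2, dim=-1)  # same selection step as run_moe_torch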

pyproject.toml

Lines changed: 1 addition & 0 deletions

@@ -146,6 +146,7 @@ ignore = [
     "RUF059", # unpacked variable is never used
     "UP007", # require using X | Y for type annotations
     "UP045", # require using X | None for type annotations
+    "B023", # Function definition does not bind loop variable
 ]

 [tool.ruff.lint.isort]
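
For context on the new suppression: ruff's B023 fires when a closure defined inside a loop references the loop variable, which Python binds late. A minimal reproduction of the pattern, with the conventional default-argument fix:

# B023: every lambda closes over the same `i`, so all see its final value.
callbacks = [lambda: i for i in range(3)]
print([f() for f in callbacks])  # [2, 2, 2]

# Binding the current value as a default argument captures it eagerly.
callbacks = [lambda i=i: i for i in range(3)]
print([f() for f in callbacks])  # [0, 1, 2]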

src/aiconfigurator/sdk/operations.py

Lines changed: 18 additions & 2 deletions

@@ -348,8 +348,22 @@ def query(self, database: PerfDatabase, **kwargs):
             else:
                 comm_latency = 0
         elif database.backend == common.BackendName.vllm.value:
-            raise NotImplementedError("Need to implement MoE dispatch for vllm")
-        else:  # sglang
+            assert self._moe_tp_size == 1, "vllm does not support moe_tp_size > 1"
+
+            comm_latency = 0
+
+            # Add allreduce latency when TP > 1
+            if self._attention_tp_size > 1:
+                comm_latency += database.query_allreduce(common.CommQuantMode.half, self.num_gpus, volume)
+
+            if self._attention_dp_size > 1:
+                comm_latency += database.query_nccl(
+                    common.CommQuantMode.half,
+                    self.num_gpus,
+                    "all_gather" if self._pre_dispatch else "reduce_scatter",
+                    volume * self._attention_dp_size,
+                )
+        elif database.backend == common.BackendName.sglang.value:
             if self._moe_backend == "deepep_moe":
                 if self._is_context:
                     comm_latency = database.query_deepep_normal(

@@ -370,6 +384,8 @@ def query(self, database: PerfDatabase, **kwargs):
                     )
             else:
                 raise NotImplementedError(f"MoE backend {self._moe_backend} not implemented")
+        else:
+            raise NotImplementedError(f"Backend {database.backend} not implemented")
         return comm_latency * self._scale_factor

     def get_weights(self, **kwargs):
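
Read as a formula, the new vllm branch models MoE communication cost as one half-precision allreduce over volume when attention TP > 1, plus, when attention DP > 1, an all_gather before dispatch or a reduce_scatter after combine over volume * dp (each DP rank contributes its own tokens). A stub paraphrase, with db standing in for PerfDatabase (illustrative only, not the SDK API):

def vllm_moe_comm_latency(db, attn_tp, attn_dp, num_gpus, volume, pre_dispatch):
    latency = 0.0
    if attn_tp > 1:
        latency += db.query_allreduce("half", num_gpus, volume)
    if attn_dp > 1:
        op = "all_gather" if pre_dispatch else "reduce_scatter"
        latency += db.query_nccl("half", num_gpus, op, volume * attn_dp)
    return latency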

src/aiconfigurator/sdk/perf_database.py

Lines changed: 9 additions & 0 deletions

@@ -1121,6 +1121,8 @@ def __init__(self, system: str, backend: str, version: str, systems_dir: str = "
             self._custom_allreduce_data = load_custom_allreduce_data(
                 os.path.join(data_dir, common.PerfDataFilename.custom_allreduce.value)
             )
+            self._moe_data, _ = load_moe_data(os.path.join(data_dir, common.PerfDataFilename.moe.value))
+            self._nccl_data = load_nccl_data(nccl_data_dir)
         else:  # TRTLLM
             self._gemm_data = load_gemm_data(os.path.join(data_dir, common.PerfDataFilename.gemm.value))
             self._context_attention_data = load_context_attention_data(

@@ -2590,6 +2592,13 @@ def get_sol(
             num_left, num_right = self._nearest_1d_point_helper(num_tokens, list(moe_dict.keys()), inner_only=False)
             lat = self._interp_1d([num_left, num_right], [moe_dict[num_left], moe_dict[num_right]], num_tokens)
             return lat
+        elif self.backend == common.BackendName.vllm.value:
+            moe_dict = self._moe_data[quant_mode][workload_distribution][topk][num_experts][hidden_size][
+                inter_size
+            ][moe_tp_size][moe_ep_size]
+            num_left, num_right = self._nearest_1d_point_helper(num_tokens, list(moe_dict.keys()), inner_only=False)
+            latency = self._interp_1d([num_left, num_right], [moe_dict[num_left], moe_dict[num_right]], num_tokens)
+            return latency
         else:
             raise NotImplementedError(f"backend {self.backend} not supported for moe")
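
The vllm branch reuses the lookup pattern of the other backends: index the nested measurement dict, find the two measured num_tokens nearest the query, and interpolate linearly between them. A standalone sketch of that pattern (_nearest_1d_point_helper and _interp_1d are internal to PerfDatabase; their exact behavior is assumed here):

from bisect import bisect_left

def interp_moe_latency(moe_dict, num_tokens):
    # moe_dict: measured num_tokens -> latency, as in self._moe_data[...].
    keys = sorted(moe_dict)
    if len(keys) == 1:
        return moe_dict[keys[0]]
    i = bisect_left(keys, num_tokens)
    i = min(max(i, 1), len(keys) - 1)  # clamp: bracket the query or extrapolate at the edges
    left, right = keys[i - 1], keys[i]
    t = (num_tokens - left) / (right - left)
    return moe_dict[left] + t * (moe_dict[right] - moe_dict[left])

print(interp_moe_latency({128: 0.9, 256: 1.4, 512: 2.5}, 320))  # 1.675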

src/aiconfigurator/sdk/task.py

Lines changed: 5 additions & 5 deletions

@@ -436,7 +436,6 @@ def _get_quant_mode(
     use_specific_quant_mode: str | None = None,
 ) -> tuple[str, str, str, str, str]:
     gemm_quant_mode = "fp8_block"
-    moe_quant_mode = "fp8_block"
     kvcache_quant_mode = "fp8"
     fmha_quant_mode = "float16" if model_name in ["DEEPSEEK_V3", "KIMI_K2"] else "fp8"
     comm_quant_mode = "half"

@@ -469,13 +468,14 @@ def _get_quant_mode(

     if model_name in ["DEEPSEEK_V3", "KIMI_K2"]:
         fmha_quant_mode = "float16"
+
     if (
         any(keyword in model_name for keyword in ["MOE_Mixtral", "QWEN2", "LLAMA"])
         and sm_version < 100
         and sm_version >= 89
     ):
-        gemm_quant_mode = "fp8"
-        moe_quant_mode = "fp8"
+        gemm_quant_mode = fp8_gemm_quant
+        moe_quant_mode = fp8_gemm_quant

     if use_specific_quant_mode is not None:
         if use_specific_quant_mode != "w4afp8":

@@ -730,8 +730,8 @@ def validate(self):
         """
         Check that the task can be run by AIC.
         """
-        if check_is_moe(self.model_name) and self.backend_name == "vllm":
-            raise NotImplementedError("AIConfigurator does not yet support MOE models for VLLM backend.")
+        if self.backend_name == "vllm" and get_model_family(self.model_name) == "DEEPSEEK":
+            raise NotImplementedError("AIConfigurator does not yet support DEEPSEEK models for VLLM backend.")

     def pretty(self) -> str:
         def _convert(obj: Any) -> Any:

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:adb518cbf8558ed461469cde875d81a9e36702945d0046f0fd45aae3cc98b2d1
+size 1048377

tools/sanity_check/validate_database.ipynb

Lines changed: 7 additions & 4 deletions

@@ -14,8 +14,9 @@
     "from aiconfigurator.sdk import common\n",
     "from aiconfigurator.sdk.perf_database import get_database\n",
     "\n",
-    "system = \"gb200_sxm\"\n",
-    "database = get_database(system=system, backend=\"trtllm\", version=\"1.0.0rc6\")"
+    "system = \"h100_sxm\"\n",
+    "# database = get_database(system=system, backend=\"trtllm\", version=\"1.0.0rc3\")\n",
+    "database = get_database(system=system, backend=\"vllm\", version=\"0.11.0\")"
    ]
   },
   {

@@ -563,6 +564,8 @@
     "    tp_ep_list = []\n",
     "    for tp in tp_list:\n",
     "        for ep in ep_list:\n",
+    "            if database.backend == \"vllm\" and tp > 1 and ep > 1:\n",
+    "                continue\n",
     "            if tp * ep >= 4 and tp * ep <= 16:\n",
     "                tp_ep_list.append([tp, ep])\n",
     "    m_list = [1, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 65536 * 4]\n",
@@ -775,7 +778,7 @@
    ],
   "metadata": {
    "kernelspec": {
-    "display_name": "Python 3 (ipykernel)",
+    "display_name": "myenv",
     "language": "python",
     "name": "python3"
    },

@@ -789,7 +792,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.10.17"
   }
  },
  "nbformat": 4,
