3 changes: 3 additions & 0 deletions collector/.gitignore
@@ -0,0 +1,3 @@
moe_perf.txt
*.log
moe_*/
17 changes: 15 additions & 2 deletions collector/collect.py
@@ -128,6 +128,12 @@ def worker(queue, device_id: int, func, progress_value, lock, error_queue=None,
for handler in worker_logger.handlers:
handler.flush()

# This error could be fatal and require a process restart.
if isinstance(e, torch.AcceleratorError):
# Exiting with a non-zero code would add an additional error to the summary,
# which we don't want.
exit(0)
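
For context, here is a minimal parent-side sketch of the restart pattern the comment alludes to. It is hypothetical and not part of this PR: it assumes the parent owns the worker lifecycle via multiprocessing and treats any non-zero exit code as a crash worth one retry.

import multiprocessing as mp

def run_with_restart(target, args, max_restarts=1):
    """Hypothetical helper: spawn `target`, respawning once if it dies."""
    for _ in range(max_restarts + 1):
        proc = mp.Process(target=target, args=args)
        proc.start()
        proc.join()
        if proc.exitcode == 0:
            # Covers normal completion and the deliberate exit(0) above.
            return
        # Non-zero exit: the worker crashed hard (e.g. a fatal accelerator
        # error outside the handler); retry in a fresh process.
    raise RuntimeError(f"worker failed after {max_restarts + 1} attempts")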


def parallel_run(tasks, func, num_processes, module_name="unknown"):
"""parallel runner with error collection"""
@@ -427,7 +433,7 @@ def collect_sglang(num_processes: int, ops: list[str] | None = None):

def collect_vllm(num_processes: int, ops: list[str] | None = None):
"""
Collect performance data for VLLM v1.
Collect performance data for VLLM.
"""

try:
@@ -441,7 +447,7 @@ def collect_vllm(num_processes: int, ops: list[str] | None = None):

collections = [
# GEMM collections
# vllm v1 GEMM collection for fp16, fp8, fp8_block, nvfp4, awq, and gptq
# vllm GEMM collection for fp16, fp8, fp8_block, nvfp4, awq, and gptq
{
"name": "vllm",
"type": "gemm",
Expand All @@ -464,6 +470,13 @@ def collect_vllm(num_processes: int, ops: list[str] | None = None):
"get_func": "get_generation_attention_test_cases",
"run_func": "run_attention_torch",
},
{
"name": "vllm",
"type": "moe",
"module": "collector.vllm.collect_moe",
"get_func": "get_moe_test_cases",
"run_func": "run_moe_torch",
},
]

all_errors = collect_ops(num_processes, collections, ops, version)
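
A rough sketch of how collect_ops might dispatch a registry entry such as the new MoE one above. The importlib-based lookup below is an assumption inferred from the string-valued "module", "get_func", and "run_func" fields, not the actual implementation:

import importlib

def dispatch_collection(entry: dict) -> None:
    # Resolve the module and the two functions named in the entry,
    # then run every generated test case.
    mod = importlib.import_module(entry["module"])
    get_cases = getattr(mod, entry["get_func"])
    run_case = getattr(mod, entry["run_func"])
    for case in get_cases():
        run_case(case)

Under that assumption, the MoE entry would resolve to collector.vllm.collect_moe.get_moe_test_cases and run_moe_torch.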
132 changes: 132 additions & 0 deletions collector/common_test_cases.py
@@ -0,0 +1,132 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import dataclasses
import itertools
from typing import Optional


@dataclasses.dataclass
class MoeCommonTestCase:
num_tokens_list: list[int]
hidden_size: int
inter_size: int
topk: int
num_experts: int
tp: int
ep: int
model_name: str
token_expert_distribution: str
power_law_alpha: Optional[float]


def get_common_moe_test_cases():
num_tokens = [
1,
2,
4,
8,
16,
32,
48,
64,
80,
96,
128,
160,
192,
256,
320,
384,
512,
768,
1024,
1536,
2048,
3072,
4096,
6144,
8192,
12288,
16384,
20480,
32768,
65536,
]
tp_list = [1, 2, 4, 8, 16, 32]
ep_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]
num_gpu_list = [1, 2, 4, 8, 16, 32, 64, 128, 256]

token_distributions = [
("balanced", 0.0),
("power_law", 1.01),
("power_law", 1.2),
]

# alpha_list = [1.01, 1.2]
# hidden_size, inter_s, topk, num_experts, gated act
# [15360,30720,2,16],# GPT-MOE-1.8T
# [15360,3840,16,128],# GPT-MOE-1.8T-FineGrained
# [3584,2560,8,64],# Qwen2-57B
# [2048,1408,4,60], #qwen1.5_moe
# [2048,1408,6,64], #deepseekv1_moe
# [5120,1536,6,160], #deepseekv2
model_config_list = [
[4096, 14336, 2, 8, "MOE_Mixtral8x7B"], # mixtral_8x7b
[6144, 16384, 2, 8, "MOE_Mixtral8x22B"], # mixtral_8x22b
[7168, 2048, 8, 256, "DEEPSEEK_V3"], # deepseekv3, will have 1 shared expert
[2048, 768, 8, 128, "QWEN3_30B_A3B"], # qwen3-moe, 30b-a3b
[4096, 1536, 8, 128, "QWEN3_235B"], # qwen3-moe, 235b-a22b
[6144, 2560, 8, 160, "QWEN3_480B"], # qwen3-moe, 480b-a35b
[7168, 2048, 8, 384, "KIMI_K2"], # kimi k2
[2880, 2880, 4, 128, "GPT_OSS_120B"],
[2880, 2880, 4, 32, "GPT_OSS_20B"],
]

test_cases: list[MoeCommonTestCase] = []

for (
num_gpu,  # start from fewer GPUs; workaround for a potential buffer bug in the MoE impl.
model_config,
tp,
ep,
(token_distribution, power_law_alpha),
) in itertools.product(
num_gpu_list,
model_config_list,
tp_list,
ep_list,
token_distributions,
):
hs, inter_s, topk, num_experts, model_name = model_config

# QWEN3_30B_A3B: exclude tp >= 8, since such configs are not used in actual deployments
if model_name == "QWEN3_30B_A3B" and tp >= 8:
continue

if tp * ep != num_gpu:
continue
if ep > num_experts:
continue
if num_experts % ep != 0:
continue
# Ensure inter_s is divisible by tp.
if inter_s % tp != 0:
continue

test_cases.append(
MoeCommonTestCase(
num_tokens_list=num_tokens,
hidden_size=hs,
inter_size=inter_s,
topk=topk,
num_experts=num_experts,
tp=tp,
ep=ep,
model_name=model_name,
token_expert_distribution=token_distribution,
power_law_alpha=power_law_alpha,
)
)

return test_cases
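
A small usage sketch (not part of the PR) for sanity-checking the generated grid, e.g. how many cases survive the tp/ep filters per model:

from collections import Counter

from collector.common_test_cases import get_common_moe_test_cases

cases = get_common_moe_test_cases()
print(f"{len(cases)} MoE test cases")

# Cases per model after the tp * ep == num_gpu and divisibility filters.
print(Counter(c.model_name for c in cases).most_common())

# Distinct parallelism layouts surviving for one model (deduplicated
# across the three token distributions).
layouts = sorted({(c.tp, c.ep) for c in cases if c.model_name == "DEEPSEEK_V3"})
print("DEEPSEEK_V3 (tp, ep) layouts:", layouts)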
50 changes: 33 additions & 17 deletions collector/helper.py
@@ -10,11 +10,6 @@
import signal
import sys
import traceback

try:
from cuda import cuda
except:
from cuda.bindings import driver as cuda
from datetime import datetime
from pathlib import Path

@@ -218,21 +213,42 @@ def save_error_report(errors, filename):


def get_sm_version():
# Init
(err,) = cuda.cuInit(0)
"""Get CUDA compute capability (SM version)"""
try:
import torch

# Device
err, cu_device = cuda.cuDeviceGet(0)
if torch.cuda.is_available():
device = torch.cuda.current_device()
capability = torch.cuda.get_device_capability(device)
return capability[0] * 10 + capability[1]
except Exception:
pass

# Fall back to cuda-python if the PyTorch path is unavailable.
try:
from cuda import cuda

# Get target architecture
err, sm_major = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device
)
err, sm_minor = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device
)
# Init
(err,) = cuda.cuInit(0)
if err != 0:
raise RuntimeError(f"cuInit failed with error code: {err}")

# Device
err, cu_device = cuda.cuDeviceGet(0)
if err != 0:
raise RuntimeError(f"cuDeviceGet failed with error code: {err}")

# Get target architecture
err, sm_major = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device
)
err, sm_minor = cuda.cuDeviceGetAttribute(
cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device
)

return sm_major * 10 + sm_minor
return sm_major * 10 + sm_minor
except Exception as e:
raise RuntimeError(f"Cannot get SM version: both PyTorch and cuda-python failed. Error: {e}") from e


def create_test_case_id(test_case, test_type, module_name):