Skip to content

Commit e4de7f0

Browse files
davilu-nvidia and ilyasher
authored and committed
update vllm moe collect and helper for sm ver detection
1 parent 5765ea2 commit e4de7f0

File tree

3 files changed

+68
-18
lines changed

3 files changed

+68
-18
lines changed

collector/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
moe_perf.txt
2+
*.log
3+
moe_*/

collector/collect.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,7 @@ def collect_sglang(num_processes: int, ops: list[str] | None = None):
427427

428428
def collect_vllm(num_processes: int, ops: list[str] | None = None):
429429
"""
430-
Collect performance data for VLLM v1.
430+
Collect performance data for VLLM
431431
"""
432432

433433
try:
@@ -441,7 +441,7 @@ def collect_vllm(num_processes: int, ops: list[str] | None = None):
441441

442442
collections = [
443443
# GEMM collections
444-
# vllm v1 GEMM collection for fp16, fp8, fp8_block, nvfp4, awq, and gptq
444+
# vllm GEMM collection for fp16, fp8, fp8_block, nvfp4, awq, and gptq
445445
{
446446
"name": "vllm",
447447
"type": "gemm",
@@ -464,6 +464,13 @@ def collect_vllm(num_processes: int, ops: list[str] | None = None):
464464
"get_func": "get_generation_attention_test_cases",
465465
"run_func": "run_attention_torch",
466466
},
467+
{
468+
"name": "vllm",
469+
"type": "moe",
470+
"module": "collector.vllm.collect_moe",
471+
"get_func": "get_moe_test_cases",
472+
"run_func": "run_moe_torch",
473+
},
467474
]
468475

469476
all_errors = collect_ops(num_processes, collections, ops, version)

collector/helper.py

Lines changed: 56 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -215,23 +215,63 @@ def save_error_report(errors, filename):
215215
with open(filename, "w") as f:
216216
json.dump(errors, f, indent=2)
217217

218-
219218
def get_sm_version():
    """Return the CUDA compute capability (SM version) of device 0 as an int.

    The value is ``major * 10 + minor`` (e.g. 90 for an SM 9.0 device).
    Tries PyTorch first (usually already initialized in callers), then falls
    back to querying the driver directly via cuda-python.

    Returns:
        int: compute capability encoded as major*10 + minor.

    Raises:
        RuntimeError: if neither PyTorch nor cuda-python can query the device.
    """
    # Preferred path: torch already exposes the device capability cheaply.
    try:
        import torch

        if torch.cuda.is_available():
            device = torch.cuda.current_device()
            major, minor = torch.cuda.get_device_capability(device)
            return major * 10 + minor
    except Exception:
        # torch missing or CUDA unusable through it -- fall through to the
        # driver-level fallback below.
        pass

    # Fallback: query the CUDA driver API via cuda-python.
    try:
        from cuda import cuda

        def _check(err, what):
            # cuda-python calls return a CUresult; CUDA_SUCCESS == 0.
            if err != 0:
                raise RuntimeError(f"{what} failed with error code: {err}")

        (err,) = cuda.cuInit(0)
        _check(err, "cuInit")

        err, cu_device = cuda.cuDeviceGet(0)
        _check(err, "cuDeviceGet")

        err, sm_major = cuda.cuDeviceGetAttribute(
            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
            cu_device,
        )
        _check(err, "cuDeviceGetAttribute(COMPUTE_CAPABILITY_MAJOR)")

        err, sm_minor = cuda.cuDeviceGetAttribute(
            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
            cu_device,
        )
        _check(err, "cuDeviceGetAttribute(COMPUTE_CAPABILITY_MINOR)")

        return sm_major * 10 + sm_minor
    except Exception as e:
        raise RuntimeError(
            f"Cannot get SM version: both PyTorch and cuda-python failed. "
            f"Error: {e}"
        ) from e
235275

236276

237277
def create_test_case_id(test_case, test_type, module_name):

0 commit comments

Comments
 (0)