
Commit b7db778

Add MoE support for vllm
1 parent e4de7f0 commit b7db778

9 files changed, +196 -156 lines changed

collector/collect.py

Lines changed: 8 additions & 1 deletion
@@ -128,6 +128,12 @@ def worker(queue, device_id: int, func, progress_value, lock, error_queue=None,
         for handler in worker_logger.handlers:
             handler.flush()
 
+        # This error could be fatal and require a process restart.
+        if isinstance(e, torch.AcceleratorError):
+            # Exiting with a non-zero code would add an additional error to
+            # the summary, which we don't want.
+            exit(0)
+
 
 def parallel_run(tasks, func, num_processes, module_name="unknown"):
     """parallel runner with error collection"""
@@ -329,6 +335,7 @@ def collect_ops(
                     "traceback": traceback.format_exc(),
                 }
             )
+            return all_errors
 
     return all_errors
 
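The extra return makes collect_ops surface its error list as soon as the failure branch runs, instead of only through the final return. Illustrative shape of that control flow (run_op and the op names are stand-ins, not from this commit):

import traceback


def run_op(op):
    # Stand-in for the real per-op collection; fails on the marker op.
    if op == "bad":
        raise RuntimeError("boom")


def collect(ops):
    all_errors = []
    for op in ops:
        try:
            run_op(op)
        except Exception:
            all_errors.append({"op": op, "traceback": traceback.format_exc()})
            return all_errors  # early exit, as in the hunk above
    return all_errors


print(len(collect(["ok", "bad", "never-reached"])))  # 1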
@@ -427,7 +434,7 @@ def collect_sglang(num_processes: int, ops: list[str] | None = None):
 
 def collect_vllm(num_processes: int, ops: list[str] | None = None):
     """
-    Collect performance data for VLLM
+    Collect performance data for VLLM
     """
 
     try:
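
Call-side view of the collector, for orientation. A hypothetical driver invocation, assuming collect_vllm hands back the error list that collect_ops builds; the op name "fused_moe" is illustrative and not taken from this commit:

from collector.collect import collect_vllm

# Restrict collection to one (assumed) MoE op across 8 worker processes.
errors = collect_vllm(num_processes=8, ops=["fused_moe"])
print(f"collection finished with {len(errors)} recorded errors")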

collector/helper.py

Lines changed: 10 additions & 33 deletions
@@ -10,10 +10,6 @@
 import sys
 import traceback
 
-try:
-    from cuda import cuda
-except:
-    from cuda.bindings import driver as cuda
 from datetime import datetime
 from pathlib import Path
 
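Removing the module-level try/except import means helper.py no longer needs cuda-python just to be imported; get_sm_version (next hunk) imports it lazily inside the fallback branch instead. A minimal sketch of that pattern (hypothetical helper name):

def _cuda_driver():
    # Imported only when the fallback actually runs, so machines without
    # cuda-python can still import this module.
    try:
        from cuda import cuda
    except ImportError as e:
        raise RuntimeError("cuda-python is required for the driver fallback") from e
    return cuda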
@@ -215,63 +211,44 @@ def save_error_report(errors, filename):
     with open(filename, "w") as f:
         json.dump(errors, f, indent=2)
 
+
 def get_sm_version():
     """Get CUDA compute capability (SM version)"""
     try:
         import torch
+
         if torch.cuda.is_available():
             device = torch.cuda.current_device()
             capability = torch.cuda.get_device_capability(device)
             return capability[0] * 10 + capability[1]
     except Exception:
         pass
-
+
     # fallback to cuda-python
     try:
         from cuda import cuda
+
         # Init
         (err,) = cuda.cuInit(0)
         if err != 0:
             raise RuntimeError(f"cuInit failed with error code: {err}")
-
+
         # Device
         err, cu_device = cuda.cuDeviceGet(0)
         if err != 0:
             raise RuntimeError(f"cuDeviceGet failed with error code: {err}")
-
+
         # Get target architecture
         err, sm_major = cuda.cuDeviceGetAttribute(
-            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-            cu_device
+            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device
         )
         err, sm_minor = cuda.cuDeviceGetAttribute(
-            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-            cu_device
+            cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device
         )
-
+
         return sm_major * 10 + sm_minor
     except Exception as e:
-        raise RuntimeError(
-            f"Cannot get SM version: both PyTorch and cuda-python failed. "
-            f"Error: {e}"
-        ) from e
-
-# def get_sm_version():
-#     # Init
-#     (err,) = cuda.cuInit(0)
-
-#     # Device
-#     err, cu_device = cuda.cuDeviceGet(0)
-
-#     # Get target architecture
-#     err, sm_major = cuda.cuDeviceGetAttribute(
-#         cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, cu_device
-#     )
-#     err, sm_minor = cuda.cuDeviceGetAttribute(
-#         cuda.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, cu_device
-#     )
-
-#     return sm_major * 10 + sm_minor
+        raise RuntimeError(f"Cannot get SM version: both PyTorch and cuda-python failed. Error: {e}") from e
 
 
 def create_test_case_id(test_case, test_type, module_name):
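
The consolidated get_sm_version keeps its contract: PyTorch first, cuda-python second, and an integer like 80 or 90 either way. A hypothetical call site that gates collection on the result (thresholds are illustrative; the import path mirrors the file location):

from collector.helper import get_sm_version

sm = get_sm_version()  # e.g. 80 on A100, 90 on H100
if sm >= 90:
    print("Hopper or newer: collect the FP8 paths too")
else:
    print(f"SM {sm}: FP16/BF16 kernels only")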
