
Commit f711f6a

Improve VRAM tracking
1 parent cf26aba

2 files changed: 67 additions & 22 deletions

benchmarks/benchmark.py

Lines changed: 26 additions & 6 deletions
@@ -243,16 +243,12 @@ def example_entrypoint():
         ("medium_tensor", (512, 512)),  # 262K elements, ~1MB
         ("large_tensor", (1024, 1024)),  # 1M elements, ~4MB
         ("image_8k", (3, 8192, 8192)),  # 201M elements, ~800MB (8K RGB image)
-        ("model_6gb", (40132, 40132)),  # 1.6B elements, ~6GB (modern LLM/diffusion model)
     ]
 
     # Create CPU tensors and add to test data
     for name, size in tensor_specs:
         try:
-            if name == "model_6gb":
-                print(f" Creating {name} tensor {size} (WARNING: This will use ~6GB RAM)...")
-            else:
-                print(f" Creating {name} tensor {size}...")
+            print(f" Creating {name} tensor {size}...")
 
             with torch.inference_mode():
                 tensor = torch.randn(*size)
@@ -265,7 +261,7 @@ def example_entrypoint():
             if not no_gpu and torch.cuda.is_available():
                 try:
                     # Skip GPU for very large tensors to avoid OOM
-                    if name == "image_8k" or name == "model_6gb":
+                    if name == "image_8k":
                         print(f" Creating GPU version of {name} (may use significant VRAM)...")
                     with torch.inference_mode():
                         gpu_tensor = tensor.cuda()
@@ -303,6 +299,30 @@ def example_entrypoint():
         except MemoryError as e:
             print(f" Skipping {name}: {e}")
 
+    # Add the 6GB model test at the very end if torch is available
+    if torch_available and not no_torch:
+        try:
+            print(" Creating model_6gb tensor (40132, 40132) (WARNING: This will use ~6GB RAM)...")
+            with torch.inference_mode():
+                model_6gb_tensor = torch.randn(40132, 40132)
+            test_data.append(("model_6gb_cpu", model_6gb_tensor))
+
+            size_gb = (model_6gb_tensor.numel() * 4) / (1024**3)
+            print(f" CPU tensor created successfully ({size_gb:.2f}GB)")
+
+            # Try GPU version if available
+            if not no_gpu and torch.cuda.is_available():
+                try:
+                    print(" Creating GPU version of model_6gb (may use significant VRAM)...")
+                    with torch.inference_mode():
+                        gpu_tensor = model_6gb_tensor.cuda()
+                    test_data.append(("model_6gb_gpu", gpu_tensor))
+                    print(" GPU tensor created successfully")
+                except RuntimeError as gpu_e:
+                    print(f" GPU tensor failed: {gpu_e}")
+        except RuntimeError as e:
+            print(f" Skipping model_6gb: {e}")
+
     from tests.test_benchmarks import BenchmarkRunner
 
     runner = BenchmarkRunner(warmup_runs=2 if quick else 5, benchmark_runs=100 if quick else 1000)
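
A quick sanity check on the "~6GB" figure, separate from the commit itself: a float32 tensor of shape (40132, 40132) holds about 1.61 billion elements at 4 bytes each, which is what the size_gb calculation in the diff reports. A minimal standalone estimate:

# Back-of-the-envelope footprint of the model_6gb tensor (float32 = 4 bytes per element).
import math

shape = (40132, 40132)
n_elements = math.prod(shape)            # 1,610,577,424 elements
size_gib = n_elements * 4 / (1024 ** 3)  # ~6.0 GiB
print(f"model_6gb needs ~{size_gib:.2f} GiB of RAM (plus a second copy in VRAM if moved to GPU)")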

benchmarks/memory_benchmark.py

Lines changed: 41 additions & 16 deletions
@@ -57,13 +57,17 @@ def __init__(self):
         self.process = psutil.Process()
         self.nvml_initialized = False
         self.gpu_handle = None
+        self.baseline_gpu_memory_mb = 0
 
         if NVML_AVAILABLE and nvml:
             try:
                 nvml.nvmlInit()
                 self.nvml_initialized = True
                 # Get the first GPU
                 self.gpu_handle = nvml.nvmlDeviceGetHandleByIndex(0)
+                # Store baseline GPU memory usage
+                mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle)
+                self.baseline_gpu_memory_mb = mem_info.used / 1024 / 1024
             except Exception as e:
                 print(f"Failed to initialize NVML: {e}")
                 self.nvml_initialized = False
@@ -113,34 +117,55 @@ def get_memory_usage(self) -> dict[str, float]:
         except Exception as e:
             print(f"Error getting RAM usage: {e}")
 
-        # Get total GPU memory usage (not per-process)
+        # Get GPU memory usage - use total system VRAM since extensions run in separate processes
        if self.nvml_initialized and self.gpu_handle:
             try:
                 # Get total GPU memory info
                 mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle)
-                memory_info["gpu_used_mb"] = mem_info.used / 1024 / 1024
+                current_used_mb = mem_info.used / 1024 / 1024
+                memory_info["gpu_used_mb"] = current_used_mb
                 memory_info["gpu_total_mb"] = mem_info.total / 1024 / 1024
-                memory_info["total_vram_mb"] = memory_info["gpu_used_mb"]
+                memory_info["total_vram_mb"] = current_used_mb
 
-                # Try to get per-process info (might return 0)
-                try:
-                    processes = nvml.nvmlDeviceGetComputeRunningProcesses(self.gpu_handle)
-                    our_pids = set(self.get_process_tree_pids())
-
-                    for proc in processes:
-                        if proc.pid in our_pids and proc.usedGpuMemory is not None:
-                            vram_mb = proc.usedGpuMemory / 1024 / 1024
-                            if proc.pid == self.process.pid:
-                                memory_info["host_vram_mb"] = vram_mb
-                except Exception as e:
-                    # Per-process tracking failed, use total GPU memory instead
-                    print(f"Warning: Per-process GPU tracking failed: {e}", file=sys.stderr)
+                # Calculate VRAM usage relative to baseline (captures all processes)
+                # This is more reliable than per-process tracking, especially on Windows
+                vram_delta = current_used_mb - self.baseline_gpu_memory_mb
+                memory_info["host_vram_mb"] = max(0, vram_delta)
 
             except Exception as e:
                 print(f"Error getting GPU memory usage: {e}")
 
+        # Fallback: try PyTorch CUDA memory for current process if NVML failed
+        elif CUDA_AVAILABLE and torch.cuda.is_available():
+            try:
+                # This only captures current process, but better than nothing
+                allocated_mb = torch.cuda.memory_allocated() / 1024 / 1024
+                reserved_mb = torch.cuda.memory_reserved() / 1024 / 1024
+
+                memory_info["host_vram_mb"] = allocated_mb
+                memory_info["total_vram_mb"] = allocated_mb
+                memory_info["pytorch_reserved_mb"] = reserved_mb
+
+                print(
+                    "Warning: Using PyTorch CUDA memory (current process only): "
+                    + f"{allocated_mb:.1f} MB allocated",
+                    file=sys.stderr,
+                )
+
+            except Exception as e:
+                print(f"Error getting PyTorch CUDA memory: {e}")
+
         return memory_info
 
+    def reset_baseline(self):
+        """Reset the baseline GPU memory measurement."""
+        if self.nvml_initialized and self.gpu_handle:
+            try:
+                mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle)
+                self.baseline_gpu_memory_mb = mem_info.used / 1024 / 1024
+            except Exception as e:
+                print(f"Error resetting GPU memory baseline: {e}")
+
     def __del__(self):
         """Cleanup NVML on deletion."""
         if self.nvml_initialized:
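
The measurement strategy above boils down to a device-wide baseline delta: record total VRAM in use when the tracker starts, then attribute any growth to the benchmarked workload, even when the allocation happens in a child process that per-process NVML queries may miss. A minimal standalone sketch of the same idea, assuming the pynvml package (the NVML binding the benchmark imports as nvml):

import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first GPU, as in the benchmark

# Record the baseline before the workload starts.
baseline_mb = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024

# ... run the GPU workload here, possibly in a separate process ...

# Device-wide usage minus the baseline approximates VRAM attributable to the run.
current_mb = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024
vram_delta_mb = max(0, current_mb - baseline_mb)
print(f"VRAM attributable to this run: ~{vram_delta_mb:.1f} MB")

pynvml.nvmlShutdown()

The trade-off is that any unrelated process allocating or freeing VRAM during the run skews the delta, which is why the tracker clamps negative deltas to zero and exposes reset_baseline() to re-zero between benchmark runs.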
