 import argparse
 import asyncio
 import gc
+import platform
 import sys
 import time
 from pathlib import Path
@@ -58,6 +59,7 @@ def __init__(self):
         self.nvml_initialized = False
         self.gpu_handle = None
         self.baseline_gpu_memory_mb = 0
+        self.platform = platform.system()
 
         if NVML_AVAILABLE and nvml:
             try:
@@ -68,8 +70,12 @@ def __init__(self):
                 # Store baseline GPU memory usage
                 mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle)
                 self.baseline_gpu_memory_mb = mem_info.used / 1024 / 1024
+                print(
+                    f"NVML initialized on {self.platform}. "
+                    f"Initial GPU memory: {self.baseline_gpu_memory_mb:.1f} MB"
+                )
             except Exception as e:
-                print(f"Failed to initialize NVML: {e}")
+                print(f"Failed to initialize NVML on {self.platform}: {e}")
                 self.nvml_initialized = False
 
     def get_process_tree_pids(self) -> list[int]:
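For reference, the baseline/delta bookkeeping that the new platform-aware logging reports boils down to a handful of NVML calls. A minimal standalone sketch, assuming the nvidia-ml-py bindings are importable as `pynvml` (this script imports them under the `nvml` alias) and that GPU index 0 is the device being tracked:

import pynvml  # assumption: nvidia-ml-py / nvidia-ml-py3 bindings

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # assumption: first GPU

# Record the baseline once, before the workload allocates anything.
baseline_mb = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024

# ... run the workload ...

# Later samples are reported as growth over that baseline, clamped at zero.
current_mb = pynvml.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024
print(f"host_vram_mb: {max(0, current_mb - baseline_mb):.1f}")

pynvml.nvmlShutdown()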
@@ -132,6 +138,15 @@ def get_memory_usage(self) -> dict[str, float]:
                 vram_delta = current_used_mb - self.baseline_gpu_memory_mb
                 memory_info["host_vram_mb"] = max(0, vram_delta)
 
+                # Debug output for Windows
+                if self.platform == "Windows":
+                    print(
+                        f"[DEBUG Windows] Current GPU: {current_used_mb:.1f} MB, "
+                        f"Baseline: {self.baseline_gpu_memory_mb:.1f} MB, "
+                        f"Delta: {vram_delta:.1f} MB",
+                        file=sys.stderr,
+                    )
+
             except Exception as e:
                 print(f"Error getting GPU memory usage: {e}")
 
@@ -162,9 +177,15 @@ def reset_baseline(self):
         if self.nvml_initialized and self.gpu_handle:
             try:
                 mem_info = nvml.nvmlDeviceGetMemoryInfo(self.gpu_handle)
+                old_baseline = self.baseline_gpu_memory_mb
                 self.baseline_gpu_memory_mb = mem_info.used / 1024 / 1024
+                print(
+                    f"[DEBUG {self.platform}] Reset baseline from {old_baseline:.1f} MB "
+                    f"to {self.baseline_gpu_memory_mb:.1f} MB",
+                    file=sys.stderr,
+                )
             except Exception as e:
-                print(f"Error resetting GPU memory baseline: {e}")
+                print(f"Error resetting GPU memory baseline on {self.platform}: {e}")
 
     def __del__(self):
         """Cleanup NVML on deletion."""
@@ -297,11 +318,23 @@ async def run_scaling_test(
             ExtensionManagerConfig(venv_root_path=str(self.test_base.test_root / "extension-venvs")),
         )
 
-        # Measure memory before creating extensions
+        # Clean up and reset baseline before measuring
         gc.collect()
         if CUDA_AVAILABLE:
             torch.cuda.empty_cache()
+            torch.cuda.synchronize()  # Ensure all operations complete
+
+        # Reset GPU memory baseline for this test
+        self.memory_tracker.reset_baseline()
+
+        # Wait a moment for memory to settle
+        await asyncio.sleep(1)
+
         before_memory = self.memory_tracker.get_memory_usage()
+        print(
+            f"Baseline GPU memory: {before_memory.get('gpu_used_mb', 0):.1f} MB "
+            f"(baseline: {self.memory_tracker.baseline_gpu_memory_mb:.1f} MB)"
+        )
 
         # Create and load extensions
         print(f"Creating {num_extensions} extensions...")
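The reordering above (collect garbage, empty the CUDA cache, synchronize, reset the NVML baseline, then pause before sampling) is the measurement discipline the rest of the test depends on. A minimal sketch of that sequence in isolation, assuming CUDA is available and `tracker` is any object exposing the `reset_baseline()` and `get_memory_usage()` methods seen in this diff:

import asyncio
import gc

import torch

async def settle_and_measure(tracker):
    """Flush Python and CUDA state, then take a clean 'before' sample."""
    gc.collect()                   # drop unreferenced Python objects
    torch.cuda.empty_cache()       # return cached allocator blocks to the driver
    torch.cuda.synchronize()       # wait for queued GPU work to finish
    tracker.reset_baseline()       # exclude pre-test VRAM from later deltas
    await asyncio.sleep(1)         # give the driver a moment to settle
    return tracker.get_memory_usage()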
@@ -337,12 +370,21 @@ async def run_scaling_test(
         with torch.inference_mode():
             if use_cuda and CUDA_AVAILABLE:
                 test_tensor = torch.randn(*test_tensor_size, device="cuda")
+                torch.cuda.synchronize()  # Ensure tensor creation completes
             else:
                 test_tensor = torch.randn(*test_tensor_size)
 
         tensor_size_mb = test_tensor.element_size() * test_tensor.numel() / (1024 * 1024)
         print(f"Tensor size: {tensor_size_mb:.1f} MB on {test_tensor.device}")
 
+        # Check memory after tensor creation
+        if use_cuda and CUDA_AVAILABLE:
+            post_tensor_memory = self.memory_tracker.get_memory_usage()
+            print(
+                f"GPU memory after tensor creation: {post_tensor_memory.get('gpu_used_mb', 0):.1f} MB "
+                f"(delta: {post_tensor_memory.get('host_vram_mb', 0):.1f} MB)"
+            )
+
         # Send tensor to all extensions
         print(f"Sending tensor to {num_extensions} extensions...")
         send_start = time.time()
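As a sanity check on the size formula a few lines above: torch.randn defaults to float32, so a (1024, 1024) tensor has element_size() == 4 and numel() == 1_048_576, and the reported size is 4 * 1_048_576 / (1024 * 1024) = 4.0 MB.

import torch

t = torch.randn(1024, 1024)                        # float32 by default
mb = t.element_size() * t.numel() / (1024 * 1024)  # bytes -> MiB
assert t.element_size() == 4 and t.numel() == 1_048_576
print(f"{mb:.1f} MB")                              # prints: 4.0 MB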
@@ -352,12 +394,19 @@ async def run_scaling_test(
                 info = await ext.store_tensor(f"test_tensor_{i}", test_tensor)
                 if i == 0:
                     print(f" First extension stored: {info}")
+                # Force GPU sync after each send for accurate memory tracking
+                if use_cuda and CUDA_AVAILABLE:
+                    torch.cuda.synchronize()
             except Exception as e:
                 print(f" Failed to send to {ext_name}: {e}")
 
         send_time = time.time() - send_start
         print(f"Send completed in {send_time:.2f}s")
 
+        # Force final sync before measuring
+        if use_cuda and CUDA_AVAILABLE:
+            torch.cuda.synchronize()
+
         # Wait for memory to settle
         await asyncio.sleep(2)
 
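The repeated torch.cuda.synchronize() calls in this hunk exist because CUDA launches return as soon as the work is queued; as the diff's comments note, the syncs force the transfers to finish so the memory sample that follows reflects completed work. A minimal sketch of that ordering, with an arbitrary tensor size standing in for the real payload:

import torch

# Queue the work; the Python call returns before the GPU is done.
payload = torch.randn(4096, 4096, device="cuda")  # assumption: arbitrary demo size

# Block until the queue drains so a subsequent NVML / get_memory_usage() sample
# is taken against settled, not in-flight, GPU state.
torch.cuda.synchronize()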
@@ -413,6 +462,13 @@ async def run_scaling_test(
         print(f" RAM per extension: {result['ram_per_extension_mb']:.1f} MB")
         print(f" RAM for tensor transfer: {result['send_ram_delta_mb']:.1f} MB")
 
+        # Debug GPU memory tracking
+        print("\nGPU Memory Details:")
+        print(f" Before: {before_memory.get('gpu_used_mb', 0):.1f} MB")
+        print(f" After Load: {after_load_memory.get('gpu_used_mb', 0):.1f} MB")
+        print(f" After Send: {after_send_memory.get('gpu_used_mb', 0):.1f} MB")
+        print(f" Baseline: {self.memory_tracker.baseline_gpu_memory_mb:.1f} MB")
+
         # Show GPU memory if this is a GPU test
         if use_cuda and result["load_gpu_delta_mb"] > 0:
             print(f" GPU memory for tensor creation: {result['load_gpu_delta_mb']:.1f} MB")
@@ -426,6 +482,11 @@ async def run_scaling_test(
         # Cleanup
         print("\nCleaning up extensions...")
         manager.stop_all_extensions()
+        del test_tensor
+        gc.collect()
+        if CUDA_AVAILABLE:
+            torch.cuda.empty_cache()
+            torch.cuda.synchronize()
 
         # Wait for cleanup
         await asyncio.sleep(2)
@@ -850,12 +911,16 @@ def main():
         print("PyTorch not available. Install with: pip install torch")
         return 1
 
+    print(f"Running on: {platform.system()} {platform.release()}")
+
     if not CUDA_AVAILABLE:
         print("CUDA not available. GPU memory tests will be skipped.")
 
     if not NVML_AVAILABLE:
         print("nvidia-ml-py3 not installed. Install with: pip install nvidia-ml-py3")
         print("VRAM tracking will not be available.")
+    else:
+        print("NVML available for GPU memory tracking")
 
     # Determine what to test
     test_small = not args.large_only