
Commit d24f73d (1 parent: 1cc9ae2)

Commit message: debug

2 files changed: +163, -7 lines

comfy/int8_kernels.py

Lines changed: 8 additions & 7 deletions
@@ -348,22 +348,23 @@ def int8_gemm_kernel(
     b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
     a_s_ptrs = a_s_ptr + offs_m * k

-    # Weight scale indexing: b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K)
-    # For this N tile (pid_n), we need scales[pid_n, :] across K iterations
-    b_s_base = b_s_ptr + pid_n * k
+    # FIXED: Weight scale indexing for 2D scale array (N_blocks, K_blocks)
+    # b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K) stored in row-major
+    # For N tile pid_n, we need scales[pid_n, :] across K iterations
+    # Address calculation: scale[pid_n, i] = base + pid_n * stride + i
+    k_blocks = k  # Number of K blocks for clarity
+    b_s_base = b_s_ptr + pid_n * k_blocks

     # Create accumulators outside the loop for better performance
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    #acc_int32 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.int32)
-    for i in range(k):
+    for i in range(k_blocks):
         # Load int8 data - use other=0 (int) not 0.0 (float) to preserve int8 type
         a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0)
         b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0)
         a_s = tl.load(a_s_ptrs)
-        # Load single scalar weight scale for this (N block, K block) pair
+        # FIXED: Load single scalar weight scale for (pid_n, i) block pair
         b_s = tl.load(b_s_base + i)
         # INT8 matmul → INT32 acc, then cast to FP32 and apply per-block scaling
-        # Use explicit int32 accumulator to ensure int8 × int8 → int32 accumulation
         dot_prod = tl.dot(a, b, out_dtype=tl.int32)  # int8 × int8 → int32
         accumulator += dot_prod.to(tl.float32) * a_s[:, None] * b_s
         a_ptrs += BLOCK_SIZE_K
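
For intuition, the address calculation spelled out in the new comments is ordinary row-major indexing into the flat weight-scale buffer. A minimal sketch (plain PyTorch on CPU, with hypothetical sizes; not part of the commit) that exercises the same arithmetic:

import torch

# Hypothetical block layout mirroring the kernel: b_s has shape (N_blocks, K_blocks)
N, K, BLOCK_SIZE_K = 256, 256, 128
n_blocks, k_blocks = N // BLOCK_SIZE_K, K // BLOCK_SIZE_K

b_s = torch.arange(n_blocks * k_blocks, dtype=torch.float32).reshape(n_blocks, k_blocks)
b_s_flat = b_s.reshape(-1)  # how the kernel sees the buffer: flat, row-major

for pid_n in range(n_blocks):
    for i in range(k_blocks):
        # Matches the kernel: b_s_base = b_s_ptr + pid_n * k_blocks, then load at b_s_base + i
        assert b_s_flat[pid_n * k_blocks + i] == b_s[pid_n, i]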

debug_int8_gemm.py

Lines changed: 155 additions & 0 deletions
@@ -0,0 +1,155 @@
#!/usr/bin/env python3
"""
Debug script to test INT8 GEMM with simple known values.
This will help us understand what's going wrong.
"""
import torch
import sys

# Add comfy to path
sys.path.insert(0, '/Users/l_y_o/Work/ComfyUI')

from comfy.quant_ops import _int8_gemm_pytorch_fallback, _int8_gemm_triton_or_fallback

def test_simple_case():
    """Test with very simple values to see the difference"""
    device = torch.device('cuda')
    block_size = 128

    # Very simple case: 1 batch, small dimensions
    M, K, N = 128, 256, 256

    # Create simple int8 data: all ones
    input_int8 = torch.ones((M, K), dtype=torch.int8, device=device)
    weight_int8 = torch.ones((N, K), dtype=torch.int8, device=device)

    # Create simple scales: all 0.01
    input_scale = torch.full((M, K // block_size), 0.01, dtype=torch.float32, device=device)
    weight_scale = torch.full((N // block_size, K // block_size), 0.01, dtype=torch.float32, device=device)

    # No bias for simplicity
    bias = None

    print("=" * 80)
    print("SIMPLE TEST CASE: all ones, scales=0.01")
    print("=" * 80)
    print(f"Input shape: {input_int8.shape}, scales: {input_scale.shape}")
    print(f"Weight shape: {weight_int8.shape}, scales: {weight_scale.shape}")
    print(f"Expected: Each output element = sum(1*0.01 * 1*0.01 for k in range(K))")
    print(f"          = K * (0.01 * 0.01) = {K} * 0.0001 = {K * 0.0001}")
    print()

    # Method 1: Triton
    try:
        output_triton = _int8_gemm_triton_or_fallback(
            input_int8, input_scale, weight_int8, weight_scale, block_size, bias=bias, out_quant=False
        )
        print(f"Triton output sample (first 5): {output_triton[0, :5].cpu()}")
        print(f"Triton output mean: {output_triton.mean().item():.6f}")
        print(f"Triton output [0,0]: {output_triton[0, 0].item():.6f}")
    except Exception as e:
        print(f"Triton failed: {e}")
        output_triton = None

    # Method 2: PyTorch
    output_pytorch = _int8_gemm_pytorch_fallback(
        input_int8, input_scale, weight_int8, weight_scale, block_size, bias=bias
    )
    print(f"\nPyTorch output sample (first 5): {output_pytorch[0, :5].cpu()}")
    print(f"PyTorch output mean: {output_pytorch.mean().item():.6f}")
    print(f"PyTorch output [0,0]: {output_pytorch[0, 0].item():.6f}")

    if output_triton is not None:
        diff = (output_triton.float() - output_pytorch.float()).abs()
        print(f"\nDifference mean: {diff.mean().item():.6f}")
        print(f"Difference max: {diff.max().item():.6f}")
        print(f"Difference [0,0]: {diff[0, 0].item():.6f}")

    print("\n" + "=" * 80)


def test_scale_loading():
    """Test to see which scales are being used"""
    device = torch.device('cuda')
    block_size = 128

    M, K, N = 128, 256, 256

    # Create int8 data: all ones
    input_int8 = torch.ones((M, K), dtype=torch.int8, device=device)
    weight_int8 = torch.ones((N, K), dtype=torch.int8, device=device)

    # Create UNIQUE scales to trace which ones are being used
    # Input scales: [0.01, 0.02] for the two K blocks
    input_scale = torch.tensor([[0.01, 0.02]] * M, dtype=torch.float32, device=device)

    # Weight scales: unique value for each position
    # Shape: (N//block_size, K//block_size) = (2, 2)
    weight_scale = torch.tensor([
        [0.10, 0.20],  # N-block 0: K-block 0=0.10, K-block 1=0.20
        [0.30, 0.40],  # N-block 1: K-block 0=0.30, K-block 1=0.40
    ], dtype=torch.float32, device=device)

    print("=" * 80)
    print("SCALE LOADING TEST: unique scales to trace usage")
    print("=" * 80)
    print(f"Input scales shape: {input_scale.shape}")
    print(f"  Values: [0.01, 0.02] for K-blocks [0, 1]")
    print(f"\nWeight scales shape: {weight_scale.shape}")
    print(f"  N-block 0: K-blocks [0.10, 0.20]")
    print(f"  N-block 1: K-blocks [0.30, 0.40]")
    print()
    print("For output[i, j], we should get:")
    print("  j in [0:128]   (N-block 0): sum of [block0: 128*1*0.01*1*0.10, block1: 128*1*0.02*1*0.20]")
    print("                              = 128*0.001 + 128*0.004 = 0.128 + 0.512 = 0.640")
    print("  j in [128:256] (N-block 1): sum of [block0: 128*1*0.01*1*0.30, block1: 128*1*0.02*1*0.40]")
    print("                              = 128*0.003 + 128*0.008 = 0.384 + 1.024 = 1.408")
    print()

    # PyTorch reference
    output_pytorch = _int8_gemm_pytorch_fallback(
        input_int8, input_scale, weight_int8, weight_scale, block_size, bias=None
    )

    print("PyTorch output:")
    print(f"  output[0, 0]   (N-block 0): {output_pytorch[0, 0].item():.6f} (expected: 0.640)")
    print(f"  output[0, 128] (N-block 1): {output_pytorch[0, 128].item():.6f} (expected: 1.408)")
    print(f"  Mean of N-block 0: {output_pytorch[0, :128].mean().item():.6f}")
    print(f"  Mean of N-block 1: {output_pytorch[0, 128:].mean().item():.6f}")

    # Triton
    try:
        output_triton = _int8_gemm_triton_or_fallback(
            input_int8, input_scale, weight_int8, weight_scale, block_size, bias=None, out_quant=False
        )

        print("\nTriton output:")
        print(f"  output[0, 0]   (N-block 0): {output_triton[0, 0].item():.6f} (expected: 0.640)")
        print(f"  output[0, 128] (N-block 1): {output_triton[0, 128].item():.6f} (expected: 1.408)")
        print(f"  Mean of N-block 0: {output_triton[0, :128].mean().item():.6f}")
        print(f"  Mean of N-block 1: {output_triton[0, 128:].mean().item():.6f}")

        # Compare
        diff = (output_triton.float() - output_pytorch.float()).abs()
        print(f"\nDifference:")
        print(f"  [0, 0]: {diff[0, 0].item():.6f}")
        print(f"  [0, 128]: {diff[0, 128].item():.6f}")
        print(f"  Mean: {diff.mean().item():.6f}, Max: {diff.max().item():.6f}")

    except Exception as e:
        print(f"\nTriton failed: {e}")
        import traceback
        traceback.print_exc()

    print("=" * 80)


if __name__ == "__main__":
    if torch.cuda.is_available():
        print("CUDA available, running tests...\n")
        test_simple_case()
        print("\n")
        test_scale_loading()
    else:
        print("CUDA not available, skipping tests")
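
The expected values printed by both tests follow from block-wise dequantization followed by an ordinary matmul. A minimal reference sketch of that computation (plain PyTorch; this is an assumption about what _int8_gemm_pytorch_fallback computes, since its implementation is not part of this commit):

import torch

def blockwise_int8_gemm_reference(input_int8, input_scale, weight_int8, weight_scale, block_size):
    """Hypothetical reference: expand per-block scales to per-element scales,
    dequantize to float32, then compute input @ weight.T."""
    a_scale = input_scale.repeat_interleave(block_size, dim=1)     # (M, K)
    w_scale = weight_scale.repeat_interleave(block_size, dim=0)    # (N, K // block_size)
    w_scale = w_scale.repeat_interleave(block_size, dim=1)         # (N, K)
    a_deq = input_int8.float() * a_scale
    w_deq = weight_int8.float() * w_scale
    return a_deq @ w_deq.T                                         # (M, N)

With the unique scales from test_scale_loading(), this reference reproduces the hand-computed expectations: output[0, 0] = 0.640 and output[0, 128] = 1.408.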
