@@ -148,12 +148,108 @@ def test_scale_loading():
148148 print ("=" * 80 )
149149
150150
def test_exact_failing_case(block_size=128, batch_size=2, seq_len=8,
                            in_features=256, out_features=512, threshold=1e-3):
    """Reproduce the EXACT test case that's failing.

    Builds identical random int8 activations/weights plus per-block float
    scales, runs both the Triton-or-fallback INT8 GEMM and the pure-PyTorch
    fallback, and reports how far apart their outputs are (overall, at the
    worst element, and per N-block of the output).

    Args:
        block_size: quantization block size along the feature dimensions.
        batch_size, seq_len, in_features, out_features: problem shape;
            the defaults match the originally failing test
            (test_triton_linear_from_raw_int8_and_scales).
        threshold: mean-absolute-difference threshold used for the PASS line.

    Returns:
        True if the mean absolute difference is below ``threshold``.
    """
    device = torch.device('cuda')
    torch.manual_seed(123)

    print("=" * 80)
    print("EXACT FAILING TEST CASE: Reproduce test_triton_linear_from_raw_int8_and_scales")
    print("=" * 80)

    # Manually create int8 data and scales for input (activation).
    # Input shape: (batch_size, seq_len, in_features); one scale per
    # block_size-wide slice of the feature dimension.
    input_int8 = torch.randint(-127, 127, (batch_size, seq_len, in_features),
                               dtype=torch.int8, device=device)
    input_scale = torch.rand(batch_size, seq_len, in_features // block_size,
                             dtype=torch.float32, device=device) * 0.1

    # Manually create int8 data and scales for weight.
    # Weight shape: (out_features, in_features); one scale per
    # (N-block, K-block) tile.
    weight_int8 = torch.randint(-127, 127, (out_features, in_features),
                                dtype=torch.int8, device=device)
    weight_scale = torch.rand(out_features // block_size, in_features // block_size,
                              dtype=torch.float32, device=device) * 0.1

    # Bias
    bias = torch.randn(out_features, dtype=torch.float32, device=device)

    print(f"Input shape: {input_int8.shape}")
    print(f"Input scale shape: {input_scale.shape}")
    print(f"Weight shape: {weight_int8.shape}")
    print(f"Weight scale shape: {weight_scale.shape}")
    print(f"Bias shape: {bias.shape}")
    print()

    # Method 1: Call INT8 GEMM via Triton (or its automatic fallback).
    print("Calling Triton/fallback...")
    output_triton = _int8_gemm_triton_or_fallback(
        input_int8, input_scale, weight_int8, weight_scale, block_size,
        bias=bias, out_quant=False
    )

    # Method 2: Call the pure-PyTorch INT8 GEMM fallback directly.
    print("Calling PyTorch fallback...")
    output_pytorch = _int8_gemm_pytorch_fallback(
        input_int8, input_scale, weight_int8, weight_scale, block_size, bias=bias
    )

    # Convert both to float32 so a dtype mismatch can't skew the comparison.
    output_triton_fp32 = output_triton.to(torch.float32)
    output_pytorch_fp32 = output_pytorch.to(torch.float32)

    # Compare Method 1 vs Method 2: Triton vs PyTorch INT8 GEMM.
    abs_diff = (output_triton_fp32 - output_pytorch_fp32).abs()
    mean_abs_diff = abs_diff.mean().item()
    max_abs_diff = abs_diff.max().item()
    passed = mean_abs_diff < threshold

    print("\nComparison:")
    print(f"  Output shape: {output_triton.shape}")
    print(f"  Triton sample [0,0,:5]: {output_triton[0, 0, :5].cpu()}")
    print(f"  PyTorch sample [0,0,:5]: {output_pytorch[0, 0, :5].cpu()}")
    print(f"  Difference [0,0,:5]: {abs_diff[0, 0, :5].cpu()}")
    print(f"\n  Mean absolute difference: {mean_abs_diff:.6f}")
    print(f"  Max absolute difference: {max_abs_diff:.6f}")
    # Threshold is the single `threshold` parameter (was a hard-coded
    # "0.001000" literal that could drift from the actual comparison).
    print(f"  Test threshold: {threshold:.6f}")
    print(f"  PASS: {passed}")

    # Show where the largest difference is: unravel the flat argmax index
    # into (batch, seq, feature) coordinates via divmod.
    max_idx_flat = abs_diff.argmax().item()
    shape = abs_diff.shape
    idx_0, rem = divmod(max_idx_flat, shape[1] * shape[2])
    idx_1, idx_2 = divmod(rem, shape[2])

    print(f"\nMax difference location: [{idx_0}, {idx_1}, {idx_2}]")
    print(f"  Triton value: {output_triton_fp32[idx_0, idx_1, idx_2].item():.6f}")
    print(f"  PyTorch value: {output_pytorch_fp32[idx_0, idx_1, idx_2].item():.6f}")

    # Check if there's a pattern by N-block: error clustering in specific
    # output blocks would point at a per-block scale-indexing bug.
    print("\nDifference by N-block:")
    for n_block in range(out_features // block_size):
        start = n_block * block_size
        end = start + block_size
        block_diff = abs_diff[:, :, start:end].mean().item()
        print(f"  N-block {n_block} (outputs {start}:{end}): mean diff = {block_diff:.6f}")

    print("=" * 80)
    return passed
244+
if __name__ == "__main__":
    # Every test in this module needs a CUDA device, so bail out early
    # when none is present.
    if not torch.cuda.is_available():
        print("CUDA not available, skipping tests")
    else:
        print("CUDA available, running tests...\n")
        suite = (test_simple_case, test_scale_loading, test_exact_failing_case)
        for position, run_test in enumerate(suite):
            if position:
                print("\n")  # blank gap between consecutive tests
            run_test()
159255
0 commit comments