fix: clarify row-major weight-scale (b_s) indexing in int8 GEMM kernels by aliasing k as k_blocks

lyogavin · lyogavin · commit 001e0e03256e · 2025-11-24T22:05:46.000-06:00
diff --git a/comfy/int8_kernels.py b/comfy/int8_kernels.py
@@ -429,23 +429,24 @@ def int8_gemm_addmm_kernel(
     b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
     a_s_ptrs = a_s_ptr + offs_m * k
     
-    # Weight scale indexing: b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K)
-    # For this N tile (pid_n), we need scales[pid_n, :] across K iterations
-    b_s_base = b_s_ptr + pid_n * k
+    # FIXED: Weight scale indexing for 2D scale array (N_blocks, K_blocks)
+    # b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K) stored in row-major
+    # For N tile pid_n, we need scales[pid_n, :] across K iterations
+    # Address calculation: scale[pid_n, i] = base + pid_n * stride + i
+    k_blocks = k  # Number of K blocks for clarity
+    b_s_base = b_s_ptr + pid_n * k_blocks
 
     # Accumulate matmul result
     # Create accumulators outside the loop for better performance
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    #acc_int32 = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.int32)
-    for i in range(k):
+    for i in range(k_blocks):
         # Load int8 data - use other=0 (int) not 0.0 (float) to preserve int8 type
         a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0)
         b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0)
         a_s = tl.load(a_s_ptrs)
-        # Load single scalar weight scale for this (N block, K block) pair
+        # FIXED: Load single scalar weight scale for (pid_n, i) block pair
         b_s = tl.load(b_s_base + i)
         # INT8 matmul → INT32 acc, then cast to FP32 and apply per-block scaling
-        # Use explicit int32 accumulator to ensure int8 × int8 → int32 accumulation
         dot_prod = tl.dot(a, b, out_dtype=tl.int32)  # int8 × int8 → int32
         accumulator += dot_prod.to(tl.float32) * a_s[:, None] * b_s
         a_ptrs += BLOCK_SIZE_K
@@ -670,17 +671,19 @@ def int8_gemm_quant_kernel(
     b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
     a_s_ptrs = a_s_ptr + offs_m * k
     
-    # Weight scale indexing: b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K)
-    # For this N tile (pid_n), we need scales[pid_n, :] across K iterations
-    b_s_base = b_s_ptr + pid_n * k
+    # FIXED: Weight scale indexing for 2D scale array (N_blocks, K_blocks)
+    # b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K) stored in row-major
+    # For N tile pid_n, we need scales[pid_n, :] across K iterations
+    k_blocks = k  # Number of K blocks for clarity
+    b_s_base = b_s_ptr + pid_n * k_blocks
 
     # Accumulate matmul result
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for i in range(k):
+    for i in range(k_blocks):
         a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0)
         b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0)
         a_s = tl.load(a_s_ptrs)
-        # Load single scalar weight scale for this (N block, K block) pair
+        # FIXED: Load single scalar weight scale for (pid_n, i) block pair
         b_s = tl.load(b_s_base + i)
         dot_prod = tl.dot(a, b, out_dtype=tl.int32)
         accumulator += dot_prod.to(tl.float32) * a_s[:, None] * b_s
@@ -783,17 +786,19 @@ def int8_gemm_addmm_quant_kernel(
     b_ptrs = b_ptr + offs_n[None, :] * K + offs_k[:, None]
     a_s_ptrs = a_s_ptr + offs_m * k
     
-    # Weight scale indexing: b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K)
-    # For this N tile (pid_n), we need scales[pid_n, :] across K iterations
-    b_s_base = b_s_ptr + pid_n * k
+    # FIXED: Weight scale indexing for 2D scale array (N_blocks, K_blocks)
+    # b_s has shape (N//BLOCK_SIZE_K, K//BLOCK_SIZE_K) stored in row-major
+    # For N tile pid_n, we need scales[pid_n, :] across K iterations
+    k_blocks = k  # Number of K blocks for clarity
+    b_s_base = b_s_ptr + pid_n * k_blocks
 
     # Accumulate matmul result
     accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32)
-    for i in range(k):
+    for i in range(k_blocks):
         a = tl.load(a_ptrs, mask=offs_k[None, :] < K - i * BLOCK_SIZE_K, other=0)
         b = tl.load(b_ptrs, mask=offs_k[:, None] < K - i * BLOCK_SIZE_K, other=0)
         a_s = tl.load(a_s_ptrs)
-        # Load single scalar weight scale for this (N block, K block) pair
+        # FIXED: Load single scalar weight scale for (pid_n, i) block pair
         b_s = tl.load(b_s_base + i)
         dot_prod = tl.dot(a, b, out_dtype=tl.int32)
         accumulator += dot_prod.to(tl.float32) * a_s[:, None] * b_s