@@ -1020,7 +1020,7 @@ def int8_linear(func, args, kwargs):
 
     orig_dtype = input_tensor._layout_params['orig_dtype']
     out_dtype = kwargs.get('out_dtype', orig_dtype)
-    out_quant = kwargs.get('out_quant', True)  # Whether to return quantized output
+    out_quant = kwargs.get('out_quant', False)  # Whether to return quantized output
 
     # Weight is already in (N, K) format (standard PyTorch weight format)
     # Pass out_quant to _int8_gemm_triton_or_fallback for fused matmul+quant
@@ -1080,7 +1080,7 @@ def int8_mm(func, args, kwargs):
 
     orig_dtype = input_tensor._layout_params['orig_dtype']
     out_dtype = kwargs.get('out_dtype', orig_dtype)
-    out_quant = kwargs.get('out_quant', True)  # Whether to return quantized output (default: True)
1083+ out_quant = kwargs .get ('out_quant' , False ) # Whether to return quantized output (default: True)
 
     # Check if weight needs to be transposed to (N, K) format
     # For mm: input is (M, K), weight should be (N, K) for the kernel
@@ -1154,7 +1154,7 @@ def int8_addmm(func, args, kwargs):
 
     orig_dtype = input_tensor._layout_params['orig_dtype']
     out_dtype = kwargs.get('out_dtype', orig_dtype)
-    out_quant = kwargs.get('out_quant', True)  # Whether to return quantized output
+    out_quant = kwargs.get('out_quant', False)  # Whether to return quantized output
 
     # PyTorch's F.linear internally calls addmm(bias, input, weight.t())
     # So weight arrives in (K, N) format (transposed), need to transpose back to (N, K)
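For readers following the change: all three hunks flip the default of the `out_quant` keyword from `True` to `False` in the linear/mm/addmm overrides, so callers now receive a dequantized result in `orig_dtype` unless they explicitly request a quantized output. The sketch below is a simplified, hypothetical illustration of that dispatch pattern; the helpers `_quantize_per_tensor` and `_int8_gemm_sketch` are stand-ins for this illustration only, not the repo's actual `_int8_gemm_triton_or_fallback` kernel, and a real kernel would accumulate in int32 rather than emulating the GEMM in float32 as done here.

```python
import torch

def _quantize_per_tensor(x: torch.Tensor):
    # Hypothetical helper: symmetric per-tensor int8 quantization.
    scale = x.abs().amax().clamp(min=1e-8) / 127.0
    q = torch.clamp(torch.round(x / scale), -128, 127).to(torch.int8)
    return q, scale

def _int8_gemm_sketch(a_q, a_scale, b_q, b_scale):
    # Emulated int8 GEMM: a real kernel would accumulate in int32;
    # here we cast to float32 so the sketch runs anywhere.
    acc = a_q.to(torch.float32) @ b_q.to(torch.float32).t()
    return acc * (a_scale * b_scale)

def int8_linear_sketch(input_q, input_scale, weight_q, weight_scale,
                       orig_dtype=torch.float16, **kwargs):
    # Mirrors the kwarg handling in the diff: out_quant now defaults to
    # False, so the caller gets a dequantized tensor in orig_dtype.
    out_dtype = kwargs.get('out_dtype', orig_dtype)
    out_quant = kwargs.get('out_quant', False)

    out = _int8_gemm_sketch(input_q, input_scale, weight_q, weight_scale)
    if out_quant:
        # Opt-in fused path: re-quantize the GEMM output, return int8 + scale.
        return _quantize_per_tensor(out)
    return out.to(out_dtype)

# Usage: weight stays in (N, K) layout, matching the comments in the diff.
x_q, x_s = _quantize_per_tensor(torch.randn(4, 16))
w_q, w_s = _quantize_per_tensor(torch.randn(8, 16))   # (N, K)
y = int8_linear_sketch(x_q, x_s, w_q, w_s)            # fp16 output, shape (4, 8)
y_q, y_s = int8_linear_sketch(x_q, x_s, w_q, w_s, out_quant=True)
```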