[Kernel] Disable CUTLASS kernels for fp8 (vllm-project#5505)

Temirulan · Jun 13, 2024 · a64d9c2
1 parent ddf97b7
commit a64d9c2
Showing 1 changed file with 3 additions and 1 deletion.
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
@@ -257,7 +257,9 @@ def apply(self,
         #   If dynamic, layer.input_scale is None and x_scale computed from x.
         #   If static, layer.input_scale is scalar and x_scale is input_scale.
 
-        if bias is None and self.cutlass_fp8_supported:
+        # Temporarily disable CUTLASS kernels due to an illegal memory access.
+        # Original condition: if bias is None and self.cutlass_fp8_supported:
+        if False:
             qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
 
             # Fused GEMM_DQ