From 159ccf1ada3a8f74bfb1edb764a53fdd5a05722c Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Thu, 13 Jun 2024 16:38:05 -0400 Subject: [PATCH] [Kernel] Disable CUTLASS kernels for fp8 (#5505) --- vllm/model_executor/layers/quantization/fp8.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index e89fd65813c0..bc08bfcc32b3 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -257,7 +257,9 @@ def apply(self, # If dynamic, layer.input_scale is None and x_scale computed from x. # If static, layer.input_scale is scalar and x_scale is input_scale. - if bias is None and self.cutlass_fp8_supported: + # Temporarily disable CUTLASS kernels due to an illegal memory access + #if bias is None and self.cutlass_fp8_supported: + if False: qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale) # Fused GEMM_DQ