ModelCloud · Qubitium · Jan 29, 2025 · Jan 29, 2025 · Jan 29, 2025
diff --git a/gptqmodel/nn_modules/qlinear/dynamic_cuda.py b/gptqmodel/nn_modules/qlinear/dynamic_cuda.py
@@ -61,7 +61,6 @@ def __init__(
             infeatures: int,
             outfeatures: int,
             bias: bool,
-            weight_dtype=torch.float16,
             kernel_switch_threshold=128,
             **kwargs,
     ):
@@ -70,7 +69,7 @@ def __init__(
                 f"Trying to use the cuda backend, but could not import the C++/CUDA dependencies with the following error: {gptqmodel_cuda_import_exception}"
             )
         super().__init__(bits=bits, group_size=group_size, sym=sym, desc_act=desc_act, infeatures=infeatures,
-                         outfeatures=outfeatures, bias=bias, weight_dtype=weight_dtype, **kwargs)
+                         outfeatures=outfeatures, bias=bias, **kwargs)
 
         self.kernel_switch_threshold = kernel_switch_threshold
 
@@ -104,43 +103,43 @@ def forward(self, x: torch.Tensor):
         out = torch.zeros((x.shape[0], self.outfeatures), device=x.device, dtype=torch.float32)
         if self.bits == 2:
             self.gptqmodel_cuda.vecquant2matmul(
-                x.float(),
+                x.to(torch.float32),
                 self.qweight,
                 out,
-                self.scales.float(),
+                self.scales.to(torch.float32),
                 self.qzeros,
                 self.g_idx,
             )
         elif self.bits == 3:
             self.gptqmodel_cuda.vecquant3matmul(
-                x.float(),
+                x.to(torch.float32),
                 self.qweight,
                 out,
-                self.scales.float(),
+                self.scales.to(torch.float32),
                 self.qzeros,
                 self.g_idx,
             )
         elif self.bits == 4:
             self.gptqmodel_cuda.vecquant4matmul(
-                x.float(),
+                x.to(torch.float32),
                 self.qweight,
                 out,
-                self.scales.float(),
+                self.scales.to(torch.float32),
                 self.qzeros,
                 self.g_idx,
             )
         elif self.bits == 8:
             self.gptqmodel_cuda.vecquant8matmul(
-                x.float(),
+                x.to(torch.float32),
                 self.qweight,
                 out,
-                self.scales.float(),
+                self.scales.to(torch.float32),
                 self.qzeros,
                 self.g_idx,
             )
-        out = out.to(x_dtype)
-        out = out.reshape(out_shape)
-        out = out + self.bias if self.bias is not None else out
+        out = out.to(x_dtype).reshape(out_shape)
+        if self.bias is not None:
+            out = out + self.bias
         return out
 
 

diff --git a/gptqmodel/nn_modules/qlinear/tritonv2.py b/gptqmodel/nn_modules/qlinear/tritonv2.py
@@ -140,9 +140,8 @@ def forward(self, x):
             x = F.pad(x, (0, self.padded_infeatures - self.infeatures))
 
         out_shape = x.shape[:-1] + (self.outfeatures,)
-        quant_linear_fn = QuantLinearFunction
 
-        out = quant_linear_fn.apply(
+        out = QuantLinearFunction.apply(
             x.reshape(-1, x.shape[-1]),
             self.qweight,
             self.scales,
@@ -152,15 +151,16 @@ def forward(self, x):
             self.pack_dtype_bits,
             self.maxq,
         )
-        out = out.half().reshape(out_shape)
-        out = out + self.bias if self.bias is not None else out
+        out = out.to(dtype=x.dtype).reshape(out_shape)
+        if self.bias is not None:
+            out = out + self.bias
         return out
 
 
 __all__ = ["TritonV2QuantLinear"]
 
-
-def add(x: torch.Tensor, y: torch.Tensor):
+# Smoke-test Triton on XPU: the Intel-specific Triton build cannot be detected from the triton package metadata.
+def triton_test_add(x: torch.Tensor, y: torch.Tensor):
     # don't put it on top-level to avoid crash if triton was not installed
     @triton.jit
     def add_kernel(x_ptr,  # *Pointer* to first input vector.
@@ -192,7 +192,7 @@ def triton_xpu_available():
     y = torch.rand(size, device='xpu:0')
 
     try:
-        add(x, y)
+        triton_test_add(x, y)
         return True
     except Exception:
         return False

diff --git a/gptqmodel/nn_modules/triton_utils/dequant.py b/gptqmodel/nn_modules/triton_utils/dequant.py
@@ -72,8 +72,7 @@ def dequant_kernel(
     # tl.device_assert(g_idx >= 0, "index out of bounds: 0 <= tmp0 < 0")
     groups = tl.where(tmp2, tmp1, g_idx)  # tmp3 are g_idx
 
-    # TODO: why is triton upscaling dequantized weights to fp32?
-    scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to(tl.float16)
+    scales = tl.load(scales_ptr + (col_idx + (outfeatures * groups)), None).to(tl.float32)
 
     # Unpack weights
     weights = (qweights >> wf_weights) & maxq  # bit shift qweight
@@ -88,10 +87,7 @@ def dequant_kernel(
     zeros = (qzeros >> wf_zeros) & maxq
 
     # Dequantize
-    weights = weights - zeros
-    # TODO: why is triton upscaling dequantized weights to fp32?
-    weights = weights.to(tl.float16)
-    weights = scales * weights
+    weights = (weights - zeros).to(tl.float32) * scales
 
     tl.store(out_ptr + (x_index), weights, mask=xmask)