more clean-up
dsikka committed Oct 1, 2024
1 parent bbf575e commit 79126f9
Showing 4 changed files with 2 additions and 14 deletions.
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/awq.py
@@ -169,4 +169,4 @@ def apply(self,
                            pack_factor)
         if bias is not None:
             out.add_(bias)
-        return out.reshape(out_shape)
+        return out.reshape(out_shape)
1 change: 0 additions & 1 deletion vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -509,7 +509,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         )
         replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
         # Repack scales
-        # Why does this take the intermediate size for size_k?
         marlin_w13_scales = marlin_moe_permute_scales(
             s=layer.w13_scales,
             size_k=layer.intermediate_size_per_partition,
11 changes: 0 additions & 11 deletions vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -273,17 +273,6 @@ def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
     return output
 
 
-# Newly generated tensors need to replace existing tensors that are
-# already registered as parameters by vLLM (and won't be freed)
-def replace_tensor(layer: torch.nn.Module, name: str,
-                   new_t: torch.Tensor) -> None:
-    # It is important to use resize_() here since it ensures
-    # the same buffer is reused
-    getattr(layer, name).resize_(new_t.shape)
-    getattr(layer, name).copy_(new_t)
-    del new_t
-
-
 def apply_gptq_marlin_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
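Context for the deletion above: replace_tensor existed so that repacked Marlin tensors could overwrite attributes vLLM had already registered as parameters, reusing the registered buffer via resize_()/copy_(). The call sites now go through replace_parameter (see the gptq_marlin.py hunk above), which leaves this helper unused. The sketch below only illustrates the idea such a helper has to cover; replace_parameter_sketch is a hypothetical name and is not claimed to match vLLM's actual replace_parameter implementation.

import torch


def replace_parameter_sketch(layer: torch.nn.Module, name: str,
                             new_t: torch.Tensor) -> None:
    # Illustrative only: swap a repacked tensor into an attribute the module
    # already exposes, either by reusing the registered buffer (the trick the
    # deleted replace_tensor used) or by re-registering the parameter.
    old = getattr(layer, name)
    if old.dtype == new_t.dtype and old.numel() == new_t.numel():
        # Same dtype and element count: reuse the existing storage in place,
        # matching the resize_()/copy_() approach of the removed helper
        # (assumes requires_grad=False, as vLLM's quantized weights are).
        old.resize_(new_t.shape)
        old.copy_(new_t)
    else:
        # Otherwise drop the stale attribute and register the repacked tensor
        # so the old storage can actually be freed.
        delattr(layer, name)
        layer.register_parameter(
            name, torch.nn.Parameter(new_t, requires_grad=False))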
2 changes: 1 addition & 1 deletion vllm/model_executor/model_loader/utils.py
@@ -24,7 +24,7 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = [
-        "fp8", "compressed-tensors", "gptq_marlin", "awq", "awq_marlin"
+        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
     ]
 
     if (model_config.quantization is not None
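The effect of the list change above: quantization methods in mixtral_supported keep the stock Mixtral implementation, while anything else, now including plain "awq", is rerouted to the quantized Mixtral fallback. A minimal sketch of that gating, assuming the fallback architecture is registered as "QuantMixtralForCausalLM" and using a hypothetical helper name pick_mixtral_architecture rather than the real get_model_architecture body:

from typing import List, Optional


def pick_mixtral_architecture(quantization: Optional[str],
                              architectures: List[str]) -> List[str]:
    # Methods on the allow-list keep the regular Mixtral implementation.
    mixtral_supported = [
        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
    ]
    if (quantization is not None
            and quantization not in mixtral_supported
            and "MixtralForCausalLM" in architectures):
        # Everything else, e.g. plain "awq" after this commit, is rerouted
        # to the quantized Mixtral variant.
        architectures = ["QuantMixtralForCausalLM"]
    return architectures


# Plain AWQ now falls through to the quantized class:
assert pick_mixtral_architecture("awq", ["MixtralForCausalLM"]) == [
    "QuantMixtralForCausalLM"
]
# AWQ-Marlin stays on the regular Mixtral path:
assert pick_mixtral_architecture("awq_marlin", ["MixtralForCausalLM"]) == [
    "MixtralForCausalLM"
]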
