more clean-up
dsikka committed Oct 1, 2024
1 parent bbf575e commit 79126f9
Showing 4 changed files with 2 additions and 14 deletions.
2 changes: 1 addition & 1 deletion vllm/model_executor/layers/quantization/awq.py
@@ -169,4 +169,4 @@ def apply(self,
                            pack_factor)
         if bias is not None:
             out.add_(bias)
-        return out.reshape(out_shape)
+        return out.reshape(out_shape)
1 change: 0 additions & 1 deletion vllm/model_executor/layers/quantization/gptq_marlin.py
@@ -509,7 +509,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
         )
         replace_parameter(layer, "w2_qweight", marlin_w2_qweight)
         # Repack scales
-        # Why does this take the intermediate size for size_k?
         marlin_w13_scales = marlin_moe_permute_scales(
             s=layer.w13_scales,
             size_k=layer.intermediate_size_per_partition,
11 changes: 0 additions & 11 deletions vllm/model_executor/layers/quantization/utils/marlin_utils.py
@@ -273,17 +273,6 @@ def moe_awq_to_marlin_zero_points(q_zp_packed: torch.Tensor, size_k: int,
     return output
 
 
-# Newly generated tensors need to replace existing tensors that are
-# already registered as parameters by vLLM (and won't be freed)
-def replace_tensor(layer: torch.nn.Module, name: str,
-                   new_t: torch.Tensor) -> None:
-    # It is important to use resize_() here since it ensures
-    # the same buffer is reused
-    getattr(layer, name).resize_(new_t.shape)
-    getattr(layer, name).copy_(new_t)
-    del new_t
-
-
 def apply_gptq_marlin_linear(
     input: torch.Tensor,
     weight: torch.Tensor,
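Context for the deletion above: replace_tensor existed so that repacked Marlin tensors could overwrite attributes vLLM had already registered as parameters, reusing the registered buffer via resize_()/copy_(). The call sites now go through replace_parameter (see the gptq_marlin.py hunk above), which leaves this helper unused. The sketch below only illustrates the idea such a helper has to cover; replace_parameter_sketch is a hypothetical name and is not claimed to match vLLM's actual replace_parameter implementation.

import torch


def replace_parameter_sketch(layer: torch.nn.Module, name: str,
                             new_t: torch.Tensor) -> None:
    # Illustrative only: swap a repacked tensor into an attribute the module
    # already exposes, either by reusing the registered buffer (the trick the
    # deleted replace_tensor used) or by re-registering the parameter.
    old = getattr(layer, name)
    if old.dtype == new_t.dtype and old.numel() == new_t.numel():
        # Same dtype and element count: reuse the existing storage in place,
        # matching the resize_()/copy_() approach of the removed helper
        # (assumes requires_grad=False, as vLLM's quantized weights are).
        old.resize_(new_t.shape)
        old.copy_(new_t)
    else:
        # Otherwise drop the stale attribute and register the repacked tensor
        # so the old storage can actually be freed.
        delattr(layer, name)
        layer.register_parameter(
            name, torch.nn.Parameter(new_t, requires_grad=False))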
2 changes: 1 addition & 1 deletion vllm/model_executor/model_loader/utils.py
@@ -24,7 +24,7 @@ def get_model_architecture(
     # Special handling for quantized Mixtral.
     # FIXME(woosuk): This is a temporary hack.
     mixtral_supported = [
-        "fp8", "compressed-tensors", "gptq_marlin", "awq", "awq_marlin"
+        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
     ]
 
     if (model_config.quantization is not None
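The effect of the list change above: quantization methods in mixtral_supported keep the stock Mixtral implementation, while anything else, now including plain "awq", is rerouted to the quantized Mixtral fallback. A minimal sketch of that gating, assuming the fallback architecture is registered as "QuantMixtralForCausalLM" and using a hypothetical helper name pick_mixtral_architecture rather than the real get_model_architecture body:

from typing import List, Optional


def pick_mixtral_architecture(quantization: Optional[str],
                              architectures: List[str]) -> List[str]:
    # Methods on the allow-list keep the regular Mixtral implementation.
    mixtral_supported = [
        "fp8", "compressed-tensors", "gptq_marlin", "awq_marlin"
    ]
    if (quantization is not None
            and quantization not in mixtral_supported
            and "MixtralForCausalLM" in architectures):
        # Everything else, e.g. plain "awq" after this commit, is rerouted
        # to the quantized Mixtral variant.
        architectures = ["QuantMixtralForCausalLM"]
    return architectures


# Plain AWQ now falls through to the quantized class:
assert pick_mixtral_architecture("awq", ["MixtralForCausalLM"]) == [
    "QuantMixtralForCausalLM"
]
# AWQ-Marlin stays on the regular Mixtral path:
assert pick_mixtral_architecture("awq_marlin", ["MixtralForCausalLM"]) == [
    "MixtralForCausalLM"
]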
