diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py index 3ba15573c217..b26a3227e693 100644 --- a/vllm/model_executor/layers/vocab_parallel_embedding.py +++ b/vllm/model_executor/layers/vocab_parallel_embedding.py @@ -351,7 +351,10 @@ def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor): param.weight_type = loaded_weight.item() return elif isinstance(param, UninitializedParameter): - param.materialize(loaded_weight.shape, dtype=loaded_weight.dtype) + shape = list(loaded_weight.shape) + if output_dim is not None: + shape[output_dim] = shape[output_dim] // self.tp_size + param.materialize(tuple(shape), dtype=loaded_weight.dtype) # If parameter does not have output dim, then it should # be copied onto all gpus (e.g. g_idx for act_order gptq).