From 355a6eadba771a8fc153bb103d489bc7ae2e05fa Mon Sep 17 00:00:00 2001
From: Vik Korrapati
Date: Sat, 20 Apr 2024 16:42:27 -0700
Subject: [PATCH 1/2] [Model] Add moondream vision language model

---
 README.md                                  |   1 +
 examples/moondream_example.py              |  49 ++++
 vllm/model_executor/model_loader/loader.py |   5 +-
 vllm/model_executor/models/__init__.py     |   4 +-
 vllm/model_executor/models/moondream.py    | 266 +++++++++++++++++++++
 vllm/model_executor/models/phi.py          |  16 +-
 6 files changed, 331 insertions(+), 10 deletions(-)
 create mode 100644 examples/moondream_example.py
 create mode 100644 vllm/model_executor/models/moondream.py

diff --git a/README.md b/README.md
index 947d50d4ad76..fbbb3be60236 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
 - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
+- Moondream (`vikhyatk/moondream2`, `vikhyatk/moondream1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
diff --git a/examples/moondream_example.py b/examples/moondream_example.py
new file mode 100644
index 000000000000..07b3d628d9ae
--- /dev/null
+++ b/examples/moondream_example.py
@@ -0,0 +1,49 @@
+import torch
+from torchvision.transforms.v2 import (
+    Compose,
+    Resize,
+    InterpolationMode,
+    ToImage,
+    ToDtype,
+    Normalize,
+)
+from PIL import Image
+
+from vllm import LLM, SamplingParams
+from vllm.sequence import MultiModalData
+
+if __name__ == "__main__":
+
+    sampling_params = SamplingParams(temperature=0, max_tokens=256)
+    llm = LLM(
+        model="vikhyatk/moondream2",
+        trust_remote_code=True,
+        image_input_type="pixel_values",
+        image_token_id=50256,
+        image_input_shape="1,3,378,378",
+        image_feature_size=729,
+    )
+
+    preprocess = Compose(
+        [
+            Resize(size=(378, 378), interpolation=InterpolationMode.BICUBIC),
+            ToImage(),
+            ToDtype(torch.float32, scale=True),
+            Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+        ]
+    )
+
+    image = Image.open("docs/source/assets/kernel/value.png").convert("RGB")
+    image_pixels = preprocess(image).unsqueeze(0)
+
+    outputs = llm.generate(
+        [("<|endoftext|>" * 729) + "\n\nQuestion: Describe this image.\n\nAnswer:"],
+        multi_modal_data=MultiModalData(
+            type=MultiModalData.Type.IMAGE, data=image_pixels
+        ),
+        sampling_params=sampling_params,
+    )
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 6c8cb2935f37..34646a19c1c2 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -23,13 +23,12 @@
     get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models.llava import LlavaForConditionalGeneration
+from vllm.model_executor.models.moondream import Moondream
 
 if TYPE_CHECKING:
     from vllm.model_executor.layers.linear import LinearMethodBase
 
-_VISION_MODEL_CLASSES = [
-    LlavaForConditionalGeneration,
-]
+_VISION_MODEL_CLASSES = [LlavaForConditionalGeneration, Moondream]
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 17fc97056804..a03885e1622b 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -31,8 +31,8 @@
     "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "LlavaForConditionalGeneration":
-    ("llava", "LlavaForConditionalGeneration"),
+    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
+    "Moondream": ("moondream", "Moondream"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
diff --git a/vllm/model_executor/models/moondream.py b/vllm/model_executor/models/moondream.py
new file mode 100644
index 000000000000..439f203db446
--- /dev/null
+++ b/vllm/model_executor/models/moondream.py
@@ -0,0 +1,266 @@
+from typing import Optional, Iterable, Tuple, List
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VisionLanguageConfig
+from vllm.model_executor.layers.linear import LinearMethodBase
+from vllm.model_executor.models.phi import PhiForCausalLM
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+
+
+class Attention(nn.Module):
+    def __init__(self, dim, num_heads=16):
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+
+        self.qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, self.head_dim)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv.unbind(0)
+
+        # TODO: Replace with VLLM attention implementation after adding support
+        # for non-causal attention.
+        x = F.scaled_dot_product_attention(q, k, v)
+
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        return x
+
+
+class VitBlock(nn.Module):
+    def __init__(self, embed_dim):
+        super().__init__()
+        self.attn = Attention(embed_dim)
+        self.mlp = MLP(embed_dim, 4304)
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.norm2 = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        x = x + self.attn(self.norm1(x))
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class VisionTransformer(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+        embed_len = 729
+        embed_dim = 1152
+
+        self.patch_embed = LinearPatchEmbedding()
+        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
+        self.blocks = nn.Sequential(*[VitBlock(embed_dim) for _ in range(27)])
+        self.norm = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+        x = x + self.pos_embed
+        for block in self.blocks:
+            x = block(x)
+        return self.norm(x)
+
+
+class EncoderWrapper(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.model = nn.ModuleDict({"visual": VisionTransformer()})
+
+    def forward(self, x):
+        return self.model["visual"](x)
+
+
+class LinearPatchEmbedding(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(588, 1152)
+
+    def forward(self, x):
+        b, c, hp1, wp2 = x.shape
+        p1, p2 = 14, 14
+        h, w = hp1 // p1, wp2 // p2
+        x = x.reshape(b, c, h, p1, w, p2)
+        x = x.permute(0, 2, 4, 1, 3, 5)
+        x = x.reshape(b, h * w, c * p1 * p2)
+
+        return self.linear(x)
+
+
+class MLP(nn.Module):
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: int = None,
+        out_features: int = None,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = nn.GELU(approximate="tanh")
+        self.fc2 = nn.Linear(hidden_features, out_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
+
+
+class VisionProjection(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+        image_embedding_dim = 1152
+        model_dim = 2048
+        hidden_dim = model_dim * 4
+
+        self.mlp = MLP(image_embedding_dim, hidden_dim, model_dim)
+
+    def forward(self, x):
+        return self.mlp(x)
+
+
+class VisionEncoder(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.encoder = EncoderWrapper()
+        self.projection = VisionProjection()
+
+    def forward(self, x) -> torch.Tensor:
+        x = self.encoder(x)
+        x = self.projection(x)
+        return x
+
+
+class Moondream(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        vision_language_config: VisionLanguageConfig,
+        linear_method: Optional["LinearMethodBase"] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.vision_language_config = vision_language_config
+
+        assert self.vision_language_config, (
+            "Provide `image_input_type` and other vision "
+            "related configurations through LLM entrypoint "
+            "or engine arguments."
+        )
+
+        if self.vision_language_config.image_input_type == (
+            VisionLanguageConfig.ImageInputType.PIXEL_VALUES
+        ):
+            self.vision_encoder = VisionEncoder()
+        else:
+            self.vision_encoder = None
+
+        self.linear_method = linear_method
+
+        self.text_model = PhiForCausalLM(config.text_config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        image_input: Optional[torch.Tensor] = None,
+    ) -> SamplerOutput:
+        if image_input is not None:
+            if list(image_input.shape[1:]) != list(
+                self.vision_language_config.image_input_shape[1:]
+            ):
+                raise ValueError(
+                    f"The expected image tensor shape is batch dimension "
+                    f"plus "
+                    f"{self.vision_language_config.image_input_shape[1:]}."
+                    f" You supplied {image_input.shape}. "
+                    f"If you are using vLLM's entrypoint, make sure your "
+                    f"supplied image input is consistent with "
+                    f"image_input_shape in engine args."
+                )
+
+            if self.vision_encoder is not None:
+                image_features = self.vision_encoder(image_input)
+            else:
+                image_features = image_input
+
+            inputs_embeds = self.text_model.model.embed_tokens(input_ids)
+            mask = input_ids == self.vision_language_config.image_token_id
+            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.text_model(
+            input_ids, positions, kv_caches, attn_metadata, inputs_embeds=inputs_embeds
+        )
+        return hidden_states
+
+    def compute_logits(
+        self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata
+    ) -> torch.Tensor:
+        return self.text_model.compute_logits(hidden_states, sampling_metadata)
+
+    def sample(
+        self, logits: torch.Tensor, sampling_metadata: SamplingMetadata
+    ) -> Optional[SamplerOutput]:
+        return self.text_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        params_map = {
+            "text_model.transformer.embd.wte.weight": "text_model.model.embed_tokens.weight",
+            "text_model.lm_head.linear.weight": "text_model.lm_head.weight",
+            "text_model.lm_head.linear.bias": "text_model.lm_head.bias",
+            "text_model.lm_head.ln.weight": "text_model.model.final_layernorm.weight",
+            "text_model.lm_head.ln.bias": "text_model.model.final_layernorm.bias",
+        }
+
+        for name, loaded_weight in weights:
+            param = None
+
+            if name in params_map:
+                param = params_dict[params_map[name]]
+            elif name in params_dict:
+                param = params_dict[name]
+            elif name.startswith("text_model."):
+                replacements = {
+                    "text_model.transformer.h": "text_model.model.layers",
+                    "ln": "input_layernorm",
+                    "mixer.Wqkv": "self_attn.qkv_proj",
+                    "mixer.out_proj": "self_attn.dense",
+                }
+
+                mp = name
+                for k, v in replacements.items():
+                    if k in mp:
+                        mp = mp.replace(k, v)
+                if mp in params_dict:
+                    param = params_dict[mp]
+
+            if param is None:
+                raise ValueError(f"Unmapped weight: {name}")
+            else:
+                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index f974b78a0fbd..7d4793677131 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -205,8 +205,13 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_tokens(input_ids)
+
         for i in range(self.config.num_hidden_layers):
             layer = self.layers[i]
             hidden_states = layer(
@@ -244,9 +249,11 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata)
+        hidden_states = self.model(
+            input_ids, positions, kv_caches, attn_metadata, inputs_embeds
+        )
         return hidden_states
 
@@ -295,6 +302,5 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             # pylint: disable=E1136
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
+            weight_loader = getattr(param, "weight_loader", default_weight_loader)
             weight_loader(param, loaded_weight)

From dd0e00809ca211a672da7d1a50abbf3e5b5a68e7 Mon Sep 17 00:00:00 2001
From: Vik Korrapati
Date: Sat, 20 Apr 2024 17:08:15 -0700
Subject: [PATCH 2/2] fix formatting

---
 examples/moondream_example.py           | 32 ++++-------
 vllm/model_executor/models/__init__.py  |  3 +-
 vllm/model_executor/models/moondream.py | 76 +++++++++++++------------
 vllm/model_executor/models/phi.py       |  8 +--
 4 files changed, 58 insertions(+), 61 deletions(-)

diff --git a/examples/moondream_example.py b/examples/moondream_example.py
index 07b3d628d9ae..f2dcb9595474 100644
--- a/examples/moondream_example.py
+++ b/examples/moondream_example.py
@@ -1,13 +1,7 @@
 import torch
-from torchvision.transforms.v2 import (
-    Compose,
-    Resize,
-    InterpolationMode,
-    ToImage,
-    ToDtype,
-    Normalize,
-)
 from PIL import Image
+from torchvision.transforms.v2 import (Compose, InterpolationMode, Normalize,
+                                       Resize, ToDtype, ToImage)
 
 from vllm import LLM, SamplingParams
 from vllm.sequence import MultiModalData
@@ -24,23 +18,21 @@
         image_feature_size=729,
     )
 
-    preprocess = Compose(
-        [
-            Resize(size=(378, 378), interpolation=InterpolationMode.BICUBIC),
-            ToImage(),
-            ToDtype(torch.float32, scale=True),
-            Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
-        ]
-    )
+    preprocess = Compose([
+        Resize(size=(378, 378), interpolation=InterpolationMode.BICUBIC),
+        ToImage(),
+        ToDtype(torch.float32, scale=True),
+        Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+    ])
 
     image = Image.open("docs/source/assets/kernel/value.png").convert("RGB")
     image_pixels = preprocess(image).unsqueeze(0)
 
     outputs = llm.generate(
-        [("<|endoftext|>" * 729) + "\n\nQuestion: Describe this image.\n\nAnswer:"],
-        multi_modal_data=MultiModalData(
-            type=MultiModalData.Type.IMAGE, data=image_pixels
-        ),
+        [("<|endoftext|>" * 729) +
+         "\n\nQuestion: Describe this image.\n\nAnswer:"],
+        multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
+                                        data=image_pixels),
         sampling_params=sampling_params,
     )
 
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index a03885e1622b..dec1c6385128 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -31,7 +31,8 @@
     "InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
     "JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
-    "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"),
+    "LlavaForConditionalGeneration":
+    ("llava", "LlavaForConditionalGeneration"),
     "Moondream": ("moondream", "Moondream"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
diff --git a/vllm/model_executor/models/moondream.py b/vllm/model_executor/models/moondream.py
index 439f203db446..ea27341410eb 100644
--- a/vllm/model_executor/models/moondream.py
+++ b/vllm/model_executor/models/moondream.py
@@ -1,4 +1,4 @@
-from typing import Optional, Iterable, Tuple, List
+from typing import Iterable, List, Optional, Tuple
 
 import torch
 import torch.nn as nn
@@ -8,13 +8,14 @@
 from vllm.attention import AttentionMetadata
 from vllm.config import VisionLanguageConfig
 from vllm.model_executor.layers.linear import LinearMethodBase
-from vllm.model_executor.models.phi import PhiForCausalLM
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.phi import PhiForCausalLM
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import SamplerOutput
 
 
 class Attention(nn.Module):
+
     def __init__(self, dim, num_heads=16):
         super().__init__()
         assert dim % num_heads == 0, "dim should be divisible by num_heads"
@@ -27,11 +28,8 @@ def __init__(self, dim, num_heads=16):
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         B, N, C = x.shape
-        qkv = (
-            self.qkv(x)
-            .reshape(B, N, 3, self.num_heads, self.head_dim)
-            .permute(2, 0, 3, 1, 4)
-        )
+        qkv = (self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                   self.head_dim).permute(2, 0, 3, 1, 4))
         q, k, v = qkv.unbind(0)
 
         # TODO: Replace with VLLM attention implementation after adding support
@@ -44,6 +42,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class VitBlock(nn.Module):
+
     def __init__(self, embed_dim):
         super().__init__()
         self.attn = Attention(embed_dim)
@@ -58,6 +57,7 @@ def forward(self, x):
 
 
 class VisionTransformer(nn.Module):
+
     def __init__(self):
         super().__init__()
 
@@ -65,7 +65,7 @@ def __init__(self):
         embed_dim = 1152
 
         self.patch_embed = LinearPatchEmbedding()
-        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
+        self.pos_embed = nn.Parameter(torch.zeros(1, embed_len, embed_dim))
         self.blocks = nn.Sequential(*[VitBlock(embed_dim) for _ in range(27)])
         self.norm = nn.LayerNorm(embed_dim)
 
@@ -78,6 +78,7 @@ def forward(self, x):
 
 
 class EncoderWrapper(nn.Module):
+
     def __init__(self):
         super().__init__()
         self.model = nn.ModuleDict({"visual": VisionTransformer()})
@@ -87,6 +88,7 @@ def forward(self, x):
 
 
 class LinearPatchEmbedding(nn.Module):
+
     def __init__(self):
         super().__init__()
         self.linear = nn.Linear(588, 1152)
@@ -103,6 +105,7 @@ def forward(self, x):
 
 
 class MLP(nn.Module):
+
     def __init__(
         self,
         in_features: int,
@@ -124,6 +127,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
 
 class VisionProjection(nn.Module):
+
     def __init__(self):
         super().__init__()
 
@@ -138,6 +142,7 @@ def forward(self, x):
 
 
 class VisionEncoder(nn.Module):
+
     def __init__(self):
         super().__init__()
         self.encoder = EncoderWrapper()
@@ -150,6 +155,7 @@ def forward(self, x) -> torch.Tensor:
 
 
 class Moondream(nn.Module):
+
     def __init__(
         self,
         config: PretrainedConfig,
@@ -164,12 +170,10 @@ def __init__(
         assert self.vision_language_config, (
             "Provide `image_input_type` and other vision "
             "related configurations through LLM entrypoint "
-            "or engine arguments."
-        )
+            "or engine arguments.")
 
         if self.vision_language_config.image_input_type == (
-            VisionLanguageConfig.ImageInputType.PIXEL_VALUES
-        ):
+                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
             self.vision_encoder = VisionEncoder()
         else:
             self.vision_encoder = None
@@ -188,8 +192,7 @@ def forward(
     ) -> SamplerOutput:
         if image_input is not None:
             if list(image_input.shape[1:]) != list(
-                self.vision_language_config.image_input_shape[1:]
-            ):
+                    self.vision_language_config.image_input_shape[1:]):
                 raise ValueError(
                     f"The expected image tensor shape is batch dimension "
                     f"plus "
@@ -197,8 +200,7 @@ def forward(
                     f" You supplied {image_input.shape}. "
                     f"If you are using vLLM's entrypoint, make sure your "
                     f"supplied image input is consistent with "
-                    f"image_input_shape in engine args."
-                )
+                    f"image_input_shape in engine args.")
 
             if self.vision_encoder is not None:
                 image_features = self.vision_encoder(image_input)
@@ -207,49 +209,47 @@ def forward(
 
             inputs_embeds = self.text_model.model.embed_tokens(input_ids)
             mask = input_ids == self.vision_language_config.image_token_id
-            inputs_embeds[mask] = image_features.view(-1, image_features.shape[-1])
+            inputs_embeds[mask] = image_features.view(
+                -1,
+                image_features.shape[-1],
+            )
         else:
             inputs_embeds = None
 
         hidden_states = self.text_model(
-            input_ids, positions, kv_caches, attn_metadata, inputs_embeds=inputs_embeds
+            input_ids,
+            positions,
+            kv_caches,
+            attn_metadata,
+            inputs_embeds=inputs_embeds,
         )
         return hidden_states
 
-    def compute_logits(
-        self, hidden_states: torch.Tensor, sampling_metadata: SamplingMetadata
-    ) -> torch.Tensor:
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
         return self.text_model.compute_logits(hidden_states, sampling_metadata)
 
-    def sample(
-        self, logits: torch.Tensor, sampling_metadata: SamplingMetadata
-    ) -> Optional[SamplerOutput]:
+    def sample(self, logits: torch.Tensor,
+               sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
         return self.text_model.sample(logits, sampling_metadata)
 
     def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
         params_dict = dict(self.named_parameters())
 
-        params_map = {
-            "text_model.transformer.embd.wte.weight": "text_model.model.embed_tokens.weight",
-            "text_model.lm_head.linear.weight": "text_model.lm_head.weight",
-            "text_model.lm_head.linear.bias": "text_model.lm_head.bias",
-            "text_model.lm_head.ln.weight": "text_model.model.final_layernorm.weight",
-            "text_model.lm_head.ln.bias": "text_model.model.final_layernorm.bias",
-        }
-
         for name, loaded_weight in weights:
             param = None
 
-            if name in params_map:
-                param = params_dict[params_map[name]]
-            elif name in params_dict:
+            if name in params_dict:
                 param = params_dict[name]
             elif name.startswith("text_model."):
                 replacements = {
                     "text_model.transformer.h": "text_model.model.layers",
+                    "lm_head.ln": "model.final_layernorm",
                     "ln": "input_layernorm",
                     "mixer.Wqkv": "self_attn.qkv_proj",
                     "mixer.out_proj": "self_attn.dense",
+                    "lm_head.linear": "lm_head",
+                    "transformer.embd.wte": "model.embed_tokens",
                 }
 
                 mp = name
@@ -262,5 +262,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             if param is None:
                 raise ValueError(f"Unmapped weight: {name}")
             else:
-                weight_loader = getattr(param, "weight_loader", default_weight_loader)
+                weight_loader = getattr(
+                    param,
+                    "weight_loader",
+                    default_weight_loader,
+                )
                 weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index 7d4793677131..2c93e557ae4c 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -251,9 +251,8 @@ def forward(
         attn_metadata: AttentionMetadata,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        hidden_states = self.model(
-            input_ids, positions, kv_caches, attn_metadata, inputs_embeds
-        )
+        hidden_states = self.model(input_ids, positions, kv_caches,
+                                   attn_metadata, inputs_embeds)
         return hidden_states
 
@@ -302,5 +301,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
             # pylint: disable=E1136
             param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader", default_weight_loader)
+            weight_loader = getattr(param, "weight_loader",
+                                    default_weight_loader)
             weight_loader(param, loaded_weight)