diff --git a/README.md b/README.md
index 947d50d4ad76..fbbb3be60236 100644
--- a/README.md
+++ b/README.md
@@ -73,6 +73,7 @@ vLLM seamlessly supports many Hugging Face models, including the following archi
 - MiniCPM (`openbmb/MiniCPM-2B-sft-bf16`, `openbmb/MiniCPM-2B-dpo-bf16`, etc.)
 - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
 - Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, `mistral-community/Mixtral-8x22B-v0.1`, etc.)
+- Moondream (`vikhyatk/moondream2`, `vikhyatk/moondream1`, etc.)
 - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
 - OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
 - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
diff --git a/examples/moondream_example.py b/examples/moondream_example.py
new file mode 100644
index 000000000000..f2dcb9595474
--- /dev/null
+++ b/examples/moondream_example.py
@@ -0,0 +1,41 @@
+import torch
+from PIL import Image
+from torchvision.transforms.v2 import (Compose, InterpolationMode, Normalize,
+                                       Resize, ToDtype, ToImage)
+
+from vllm import LLM, SamplingParams
+from vllm.sequence import MultiModalData
+
+if __name__ == "__main__":
+
+    sampling_params = SamplingParams(temperature=0, max_tokens=256)
+    llm = LLM(
+        model="vikhyatk/moondream2",
+        trust_remote_code=True,
+        image_input_type="pixel_values",
+        image_token_id=50256,
+        image_input_shape="1,3,378,378",
+        image_feature_size=729,
+    )
+
+    preprocess = Compose([
+        Resize(size=(378, 378), interpolation=InterpolationMode.BICUBIC),
+        ToImage(),
+        ToDtype(torch.float32, scale=True),
+        Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+    ])
+
+    image = Image.open("docs/source/assets/kernel/value.png").convert("RGB")
+    image_pixels = preprocess(image).unsqueeze(0)
+
+    outputs = llm.generate(
+        [("<|endoftext|>" * 729) +
+         "\n\nQuestion: Describe this image.\n\nAnswer:"],
+        multi_modal_data=MultiModalData(type=MultiModalData.Type.IMAGE,
+                                        data=image_pixels),
+        sampling_params=sampling_params,
+    )
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 6c8cb2935f37..34646a19c1c2 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -23,13 +23,12 @@
     get_quant_config, initialize_dummy_weights, np_cache_weights_iterator,
     pt_weights_iterator, safetensors_weights_iterator)
 from vllm.model_executor.models.llava import LlavaForConditionalGeneration
+from vllm.model_executor.models.moondream import Moondream
 
 if TYPE_CHECKING:
     from vllm.model_executor.layers.linear import LinearMethodBase
 
-_VISION_MODEL_CLASSES = [
-    LlavaForConditionalGeneration,
-]
+_VISION_MODEL_CLASSES = [LlavaForConditionalGeneration, Moondream]
 
 logger = init_logger(__name__)
 
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index 17fc97056804..dec1c6385128 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -33,6 +33,7 @@
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
     "LlavaForConditionalGeneration":
     ("llava", "LlavaForConditionalGeneration"),
+    "Moondream": ("moondream", "Moondream"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
diff --git a/vllm/model_executor/models/moondream.py b/vllm/model_executor/models/moondream.py
new file mode 100644
index 000000000000..ea27341410eb
--- /dev/null
+++ b/vllm/model_executor/models/moondream.py
@@ -0,0 +1,270 @@
+from typing import Iterable, List, Optional, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from transformers import PretrainedConfig
+
+from vllm.attention import AttentionMetadata
+from vllm.config import VisionLanguageConfig
+from vllm.model_executor.layers.linear import LinearMethodBase
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.phi import PhiForCausalLM
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import SamplerOutput
+
+
+class Attention(nn.Module):
+
+    def __init__(self, dim, num_heads=16):
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+
+        self.qkv = nn.Linear(dim, dim * 3)
+        self.proj = nn.Linear(dim, dim)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, N, C = x.shape
+        qkv = (self.qkv(x).reshape(B, N, 3, self.num_heads,
+                                   self.head_dim).permute(2, 0, 3, 1, 4))
+        q, k, v = qkv.unbind(0)
+
+        # TODO: Replace with vLLM's attention implementation once it adds
+        # support for non-causal attention.
+        x = F.scaled_dot_product_attention(q, k, v)
+
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        return x
+
+
+class VitBlock(nn.Module):
+
+    def __init__(self, embed_dim):
+        super().__init__()
+        self.attn = Attention(embed_dim)
+        self.mlp = MLP(embed_dim, 4304)
+        self.norm1 = nn.LayerNorm(embed_dim)
+        self.norm2 = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        x = x + self.attn(self.norm1(x))
+        x = x + self.mlp(self.norm2(x))
+        return x
+
+
+class VisionTransformer(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+        embed_len = 729
+        embed_dim = 1152
+
+        self.patch_embed = LinearPatchEmbedding()
+        self.pos_embed = nn.Parameter(torch.zeros(1, embed_len, embed_dim))
+        self.blocks = nn.Sequential(*[VitBlock(embed_dim) for _ in range(27)])
+        self.norm = nn.LayerNorm(embed_dim)
+
+    def forward(self, x):
+        x = self.patch_embed(x)
+        x = x + self.pos_embed
+        for block in self.blocks:
+            x = block(x)
+        return self.norm(x)
+
+
+class EncoderWrapper(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.model = nn.ModuleDict({"visual": VisionTransformer()})
+
+    def forward(self, x):
+        return self.model["visual"](x)
+
+
+class LinearPatchEmbedding(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.linear = nn.Linear(588, 1152)
+
+    def forward(self, x):
+        b, c, hp1, wp2 = x.shape
+        p1, p2 = 14, 14
+        h, w = hp1 // p1, wp2 // p2
+        x = x.reshape(b, c, h, p1, w, p2)
+        x = x.permute(0, 2, 4, 1, 3, 5)
+        x = x.reshape(b, h * w, c * p1 * p2)
+
+        return self.linear(x)
+
+
+class MLP(nn.Module):
+
+    def __init__(
+        self,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+        self.fc1 = nn.Linear(in_features, hidden_features)
+        self.act = nn.GELU(approximate="tanh")
+        self.fc2 = nn.Linear(hidden_features, out_features)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.fc1(x)
+        x = self.act(x)
+        x = self.fc2(x)
+        return x
+
+
+class VisionProjection(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+
+        image_embedding_dim = 1152
+        model_dim = 2048
+        hidden_dim = model_dim * 4
+
+        self.mlp = MLP(image_embedding_dim, hidden_dim, model_dim)
+
+    def forward(self, x):
+        return self.mlp(x)
+
+
+class VisionEncoder(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.encoder = EncoderWrapper()
+        self.projection = VisionProjection()
+
+    def forward(self, x) -> torch.Tensor:
+        x = self.encoder(x)
+        x = self.projection(x)
+        return x
+
+
+class Moondream(nn.Module):
+
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        vision_language_config: VisionLanguageConfig,
+        linear_method: Optional["LinearMethodBase"] = None,
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        self.vision_language_config = vision_language_config
+
+        assert self.vision_language_config, (
+            "Provide `image_input_type` and other vision "
+            "related configurations through LLM entrypoint "
+            "or engine arguments.")
+
+        if self.vision_language_config.image_input_type == (
+                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
+            self.vision_encoder = VisionEncoder()
+        else:
+            self.vision_encoder = None
+
+        self.linear_method = linear_method
+
+        self.text_model = PhiForCausalLM(config.text_config)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        image_input: Optional[torch.Tensor] = None,
+    ) -> SamplerOutput:
+        if image_input is not None:
+            if list(image_input.shape[1:]) != list(
+                    self.vision_language_config.image_input_shape[1:]):
+                raise ValueError(
+                    f"The expected image tensor shape is batch dimension "
+                    f"plus "
+                    f"{self.vision_language_config.image_input_shape[1:]}."
+                    f" You supplied {image_input.shape}. "
+                    f"If you are using vLLM's entrypoint, make sure your "
+                    f"supplied image input is consistent with "
+                    f"image_input_shape in engine args.")
+
+            if self.vision_encoder is not None:
+                image_features = self.vision_encoder(image_input)
+            else:
+                image_features = image_input
+
+            inputs_embeds = self.text_model.model.embed_tokens(input_ids)
+            mask = input_ids == self.vision_language_config.image_token_id
+            inputs_embeds[mask] = image_features.view(
+                -1,
+                image_features.shape[-1],
+            )
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.text_model(
+            input_ids,
+            positions,
+            kv_caches,
+            attn_metadata,
+            inputs_embeds=inputs_embeds,
+        )
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        return self.text_model.compute_logits(hidden_states, sampling_metadata)
+
+    def sample(self, logits: torch.Tensor,
+               sampling_metadata: SamplingMetadata) -> Optional[SamplerOutput]:
+        return self.text_model.sample(logits, sampling_metadata)
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        params_dict = dict(self.named_parameters())
+
+        for name, loaded_weight in weights:
+            param = None
+
+            if name in params_dict:
+                param = params_dict[name]
+            elif name.startswith("text_model."):
+                replacements = {
+                    "text_model.transformer.h": "text_model.model.layers",
+                    "lm_head.ln": "model.final_layernorm",
+                    "ln": "input_layernorm",
+                    "mixer.Wqkv": "self_attn.qkv_proj",
+                    "mixer.out_proj": "self_attn.dense",
+                    "lm_head.linear": "lm_head",
+                    "transformer.embd.wte": "model.embed_tokens",
+                }
+
+                mp = name
+                for k, v in replacements.items():
+                    if k in mp:
+                        mp = mp.replace(k, v)
+                if mp in params_dict:
+                    param = params_dict[mp]
+
+            if param is None:
+                raise ValueError(f"Unmapped weight: {name}")
+            else:
+                weight_loader = getattr(
+                    param,
+                    "weight_loader",
+                    default_weight_loader,
+                )
+                weight_loader(param, loaded_weight)
diff --git a/vllm/model_executor/models/phi.py b/vllm/model_executor/models/phi.py
index f974b78a0fbd..2c93e557ae4c 100644
--- a/vllm/model_executor/models/phi.py
+++ b/vllm/model_executor/models/phi.py
@@ -205,8 +205,13 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
-        hidden_states = self.embed_tokens(input_ids)
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+        else:
+            hidden_states = self.embed_tokens(input_ids)
+
         for i in range(self.config.num_hidden_layers):
             layer = self.layers[i]
             hidden_states = layer(
@@ -244,9 +249,10 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         hidden_states = self.model(input_ids, positions, kv_caches,
-                                   attn_metadata)
+                                   attn_metadata, inputs_embeds)
         return hidden_states