From 698921d39402d3a0d53513245dbb634683178085 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 15:24:01 +0000 Subject: [PATCH 01/27] Support NVLM-D --- docs/source/models/supported_models.rst | 5 ++++ vllm/model_executor/models/__init__.py | 36 ++++++++++--------------- vllm/transformers_utils/config.py | 1 + 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index c2e1c3721865..fb2358808e0d 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -275,6 +275,11 @@ Multimodal Language Models - Image - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - + * - :code:`NVLM_D` + - NVLM-D 1.0 + - Image\ :sup:`E+` + - :code:`nvidia/NVLM-D-72B`, etc. + - * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - Image\ :sup:`E` diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index 3a57db0d04fa..a48710845c31 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -60,8 +60,6 @@ "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), - "Qwen2VLForConditionalGeneration": - ("qwen2_vl", "Qwen2VLForConditionalGeneration"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), "StableLMEpochForCausalLM": ("stablelm", "StablelmForCausalLM"), "StableLmForCausalLM": ("stablelm", "StablelmForCausalLM"), @@ -79,34 +77,28 @@ "Qwen2ForRewardModel": ("qwen2_rm", "Qwen2ForRewardModel"), } +# yapf: disable _MULTIMODAL_MODELS = { - "Blip2ForConditionalGeneration": - ("blip2", "Blip2ForConditionalGeneration"), - "ChameleonForConditionalGeneration": - ("chameleon", "ChameleonForConditionalGeneration"), + "Blip2ForConditionalGeneration": ("blip2", "Blip2ForConditionalGeneration"), + "ChameleonForConditionalGeneration": ("chameleon", "ChameleonForConditionalGeneration"), # noqa: E501 "FuyuForCausalLM": ("fuyu", "FuyuForCausalLM"), "InternVLChatModel": ("internvl", "InternVLChatModel"), - "LlavaForConditionalGeneration": ("llava", - "LlavaForConditionalGeneration"), - "LlavaNextForConditionalGeneration": ("llava_next", - "LlavaNextForConditionalGeneration"), - "LlavaNextVideoForConditionalGeneration": - ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), - "LlavaOnevisionForConditionalGeneration": - ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), + "LlavaForConditionalGeneration": ("llava", "LlavaForConditionalGeneration"), + "LlavaNextForConditionalGeneration": ("llava_next", "LlavaNextForConditionalGeneration"), # noqa: E501 + "LlavaNextVideoForConditionalGeneration": ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), # noqa: E501 + "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MiniCPMV": ("minicpmv", "MiniCPMV"), - "PaliGemmaForConditionalGeneration": ("paligemma", - "PaliGemmaForConditionalGeneration"), + "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 + "NVLM_D": ("nvlm_d", "InternVLChatModel"), + "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), - "PixtralForConditionalGeneration": ("pixtral", - "PixtralForConditionalGeneration"), + 
"PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), - "Qwen2VLForConditionalGeneration": ("qwen2_vl", - "Qwen2VLForConditionalGeneration"), + "Qwen2VLForConditionalGeneration": ("qwen2_vl", "Qwen2VLForConditionalGeneration"), # noqa: E501 "UltravoxModel": ("ultravox", "UltravoxModel"), - "MllamaForConditionalGeneration": ("mllama", - "MllamaForConditionalGeneration"), } +# yapf: enable + _CONDITIONAL_GENERATION_MODELS = { "BartModel": ("bart", "BartForConditionalGeneration"), "BartForConditionalGeneration": ("bart", "BartForConditionalGeneration"), diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 0f20e8d0c821..37366d14557a 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -55,6 +55,7 @@ "exaone": ExaoneConfig, "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, + "NVLM_D": InternVLChatConfig, "solar": SolarConfig, "ultravox": UltravoxConfig, "qwen2_vl": Qwen2VLConfig, From 33f3a5081706f7673ee71f631763a1adaf049e95 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 16:48:26 +0000 Subject: [PATCH 02/27] Fix wrong module --- vllm/model_executor/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index a48710845c31..acd537473feb 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -89,7 +89,7 @@ "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MiniCPMV": ("minicpmv", "MiniCPMV"), "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 - "NVLM_D": ("nvlm_d", "InternVLChatModel"), + "NVLM_D": ("internvl", "InternVLChatModel"), "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 From 20ebb7571358d4934ef20518d03816bf9f8ceec4 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 17:27:06 +0000 Subject: [PATCH 03/27] Avoid warning when loading config --- vllm/transformers_utils/config.py | 8 ++++---- vllm/transformers_utils/configs/__init__.py | 2 ++ vllm/transformers_utils/configs/nvlm_d.py | 12 ++++++++++++ 3 files changed, 18 insertions(+), 4 deletions(-) create mode 100644 vllm/transformers_utils/configs/nvlm_d.py diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py index 37366d14557a..966cf354ed29 100644 --- a/vllm/transformers_utils/config.py +++ b/vllm/transformers_utils/config.py @@ -23,9 +23,9 @@ InternVLChatConfig, JAISConfig, MedusaConfig, MllamaConfig, MLPSpeculatorConfig, MPTConfig, - NemotronConfig, Qwen2VLConfig, - RWConfig, SolarConfig, - UltravoxConfig) + NemotronConfig, NVLM_D_Config, + Qwen2VLConfig, RWConfig, + SolarConfig, UltravoxConfig) # yapf: enable from vllm.transformers_utils.utils import check_gguf_file @@ -55,7 +55,7 @@ "exaone": ExaoneConfig, "internvl_chat": InternVLChatConfig, "nemotron": NemotronConfig, - "NVLM_D": InternVLChatConfig, + "NVLM_D": NVLM_D_Config, "solar": SolarConfig, "ultravox": UltravoxConfig, "qwen2_vl": Qwen2VLConfig, diff --git a/vllm/transformers_utils/configs/__init__.py b/vllm/transformers_utils/configs/__init__.py index 462cd964325d..8d6385d42d00 
100644 --- a/vllm/transformers_utils/configs/__init__.py +++ b/vllm/transformers_utils/configs/__init__.py @@ -13,6 +13,7 @@ from vllm.transformers_utils.configs.mlp_speculator import MLPSpeculatorConfig from vllm.transformers_utils.configs.mpt import MPTConfig from vllm.transformers_utils.configs.nemotron import NemotronConfig +from vllm.transformers_utils.configs.nvlm_d import NVLM_D_Config from vllm.transformers_utils.configs.qwen2vl import (Qwen2VLConfig, Qwen2VLVisionConfig) from vllm.transformers_utils.configs.solar import SolarConfig @@ -31,6 +32,7 @@ "MllamaConfig", "MLPSpeculatorConfig", "NemotronConfig", + "NVLM_D_Config", "SolarConfig", "UltravoxConfig", "Qwen2VLConfig", diff --git a/vllm/transformers_utils/configs/nvlm_d.py b/vllm/transformers_utils/configs/nvlm_d.py new file mode 100644 index 000000000000..8007176aecd9 --- /dev/null +++ b/vllm/transformers_utils/configs/nvlm_d.py @@ -0,0 +1,12 @@ +# Adapted from +# https://huggingface.co/nvidia/NVLM-D-72B/blob/main/configuration_nvlm_d.py +# -------------------------------------------------------- +# NVLM-D +# Copyright (c) 2024 NVIDIA +# Licensed under Apache 2.0 License [see LICENSE for details] +# -------------------------------------------------------- +from .internvl import InternVLChatConfig + + +class NVLM_D_Config(InternVLChatConfig): + model_type = 'NVLM_D' From de394064800d6209c63d5c0dbd8c3cd518c5b982 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 17:47:08 +0000 Subject: [PATCH 04/27] Fix `mlp1` loading --- vllm/model_executor/models/internvl.py | 21 +++++++++------ vllm/model_executor/models/nvlm_d.py | 37 ++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 8 deletions(-) create mode 100644 vllm/model_executor/models/nvlm_d.py diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index e84990a2ab10..040a9a979ef5 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -395,14 +395,7 @@ def __init__(self, self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) - vit_hidden_size = config.vision_config.hidden_size - llm_hidden_size = config.text_config.hidden_size - - self.mlp1 = nn.Sequential( - nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), - nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, - llm_hidden_size), nn.GELU(), - nn.Linear(llm_hidden_size, llm_hidden_size)) + self.mlp1 = self._init_mlp1(config) self.img_context_token_id = None self.make_empty_intermediate_tensors = ( @@ -413,6 +406,18 @@ def __init__(self, else: self.sampler = Sampler() + def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: + vit_hidden_size = config.vision_config.hidden_size + llm_hidden_size = config.text_config.hidden_size + + return nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, + llm_hidden_size), + nn.GELU(), + nn.Linear(llm_hidden_size, llm_hidden_size), + ) + def pixel_shuffle(self, x, scale_factor=0.5): n, w, h, c = x.size() # N, W, H, C --> N, W, H * scale, C // scale diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py new file mode 100644 index 000000000000..06f7cb2286f7 --- /dev/null +++ b/vllm/model_executor/models/nvlm_d.py @@ -0,0 +1,37 @@ +# adapted from https://huggingface.co/nvidia/NVLM-D-72B/blob/main/modeling_nvlm_d.py +# -------------------------------------------------------- +# NVLM-D 
+# Copyright (c) 2024 NVIDIA +# Licensed under Apache 2.0 License [see LICENSE for details] +# -------------------------------------------------------- +import torch.nn as nn +from transformers import PretrainedConfig + +from vllm.inputs import INPUT_REGISTRY +from vllm.multimodal import MULTIMODAL_REGISTRY + + +from .internvl import (InternVLChatModel, dummy_data_for_internvl, + input_mapper_for_internvl, input_processor_for_internvl, + get_max_internvl_image_tokens) + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) +@MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) +class NVLM_D_Model(InternVLChatModel): + + def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: + vit_hidden_size = config.vision_config.hidden_size + llm_intermediate_size = config.text_config.intermediate_size + llm_hidden_size = config.text_config.hidden_size + + return nn.Sequential( + nn.LayerNorm(vit_hidden_size * int(1 / self.downsample_ratio)**2), + nn.Linear(vit_hidden_size * int(1 / self.downsample_ratio)**2, + llm_intermediate_size, + bias=False), + nn.GELU(), + nn.Linear(llm_intermediate_size, llm_hidden_size, bias=False), + ) From fe3ba5b3d10cf490f6ea43afbfcfcee9b800f4eb Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 17:48:20 +0000 Subject: [PATCH 05/27] Fix model loading --- vllm/model_executor/models/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index acd537473feb..ff410b79d0b8 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -89,7 +89,7 @@ "LlavaOnevisionForConditionalGeneration": ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), # noqa: E501 "MiniCPMV": ("minicpmv", "MiniCPMV"), "MllamaForConditionalGeneration": ("mllama", "MllamaForConditionalGeneration"), # noqa: E501 - "NVLM_D": ("internvl", "InternVLChatModel"), + "NVLM_D": ("nvlm_d", "NVLM_D_Model"), "PaliGemmaForConditionalGeneration": ("paligemma", "PaliGemmaForConditionalGeneration"), # noqa: E501 "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "PixtralForConditionalGeneration": ("pixtral", "PixtralForConditionalGeneration"), # noqa: E501 From 92454b88199350dc81432bee6ab55f8a54653246 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 18:34:40 +0000 Subject: [PATCH 06/27] Use NVLM-specific modules --- vllm/model_executor/models/intern_vit.py | 53 ++++--- vllm/model_executor/models/nvlm_d.py | 184 ++++++++++++++++++++++- 2 files changed, 214 insertions(+), 23 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 33b4a3acaa55..669fed12a407 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -54,7 +54,7 @@ def __init__(self, config: PretrainedConfig): self.position_embedding = nn.Parameter( torch.randn(1, self.num_positions, self.embed_dim)) - def _get_pos_embed(self, pos_embed, H, W): + def _get_pos_embed(self, pos_embed: torch.Tensor, H: int, W: int): target_dtype = pos_embed.dtype pos_embed = pos_embed.float().reshape( 1, self.image_size // self.patch_size, @@ -63,9 +63,19 @@ def _get_pos_embed(self, pos_embed, H, W): size=(H, W), mode='bicubic', align_corners=False) - pos_embed = pos_embed.reshape(1, -1, H * W).permute(0, 2, - 
1).to(target_dtype) - return pos_embed + return pos_embed.reshape(1, -1, H * W).permute(0, 2, + 1).to(target_dtype) + + def _get_position_embedding(self, H: int, W: int) -> torch.Tensor: + position_embedding = self.position_embedding + + return torch.cat( + [ + position_embedding[:, :1, :], + self._get_pos_embed(position_embedding[:, 1:, :], H, W), + ], + dim=1, + ) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: target_dtype = self.patch_embedding.weight.dtype @@ -76,12 +86,7 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) - position_embedding = torch.cat([ - self.position_embedding[:, :1, :], - self._get_pos_embed(self.position_embedding[:, 1:, :], height, - width) - ], - dim=1) + position_embedding = self._get_position_embedding(height, width) embeddings = embeddings + position_embedding.to(target_dtype) return embeddings @@ -241,14 +246,8 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.norm_type = config.norm_type - # fallback to sdpa attention if tp unavailable - tp_size = get_tensor_model_parallel_world_size() - num_heads = config.num_attention_heads - if USE_XFORMERS_OPS and num_heads % tp_size == 0: - self.attn = InternParallelAttention(config, - quant_config=quant_config) - else: - self.attn = InternSdpaAttention(config) + self.attn = self._init_attn(config, quant_config) + self.mlp = InternMLP(config, quant_config=quant_config) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) @@ -260,6 +259,17 @@ def __init__(self, self.ls2 = nn.Parameter(config.initializer_factor * torch.ones(self.embed_dim)) + def _init_attn(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]): + # fallback to sdpa attention if tp unavailable + tp_size = get_tensor_model_parallel_world_size() + num_heads = config.num_attention_heads + + if USE_XFORMERS_OPS and num_heads % tp_size == 0: + return InternParallelAttention(config, quant_config=quant_config) + + return InternSdpaAttention(config) + def forward( self, hidden_states: torch.Tensor, @@ -287,10 +297,15 @@ def __init__(self, else: num_hidden_layers = num_hidden_layers_override self.layers = nn.ModuleList([ - InternVisionEncoderLayer(config=config, quant_config=quant_config) + self._init_encoder_layer(config, quant_config) for _ in range(num_hidden_layers) ]) + def _init_encoder_layer(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]): + return InternVisionEncoderLayer(config=config, + quant_config=quant_config) + def forward(self, inputs_embeds: torch.Tensor): hidden_states = inputs_embeds diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 06f7cb2286f7..b7ee9401ca53 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -4,16 +4,192 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- +from typing import Optional + +import torch import torch.nn as nn +import torch.nn.functional as F from transformers import PretrainedConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import INPUT_REGISTRY +from vllm.model_executor.layers.layernorm import RMSNorm +from vllm.model_executor.layers.linear import (QKVParallelLinear, + RowParallelLinear) +from 
vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal import MULTIMODAL_REGISTRY - +from .intern_vit import (InternVisionEmbeddings, InternVisionEncoder, + InternVisionEncoderLayer) from .internvl import (InternVLChatModel, dummy_data_for_internvl, - input_mapper_for_internvl, input_processor_for_internvl, - get_max_internvl_image_tokens) + get_max_internvl_image_tokens, + input_mapper_for_internvl, input_processor_for_internvl) + +try: + from xformers import ops as xops + USE_XFORMERS_OPS = True +except ImportError: + USE_XFORMERS_OPS = False + + +class NVLMVisionEmbeddings(InternVisionEmbeddings): + + def _get_position_embedding(self, H: int, W: int) -> torch.Tensor: + return self.position_embedding + + +class NVLMParallelAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads ' + f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).') + + # We added additional dummy heads to the original num of heads to make + # the number of heads divisible by 8. + self.num_dummy_heads = 7 + self.dummy_dim = (self.num_dummy_heads + + self.num_heads) * self.head_dim + + self.scale = self.head_dim**-0.5 + self.qkv = QKVParallelLinear( + self.embed_dim, + self.dummy_dim, + self.num_dummy_heads + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + ) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + self.proj = RowParallelLinear( + self.dummy_dim, + self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide( + self.num_dummy_heads + self.num_heads, self.tp_size) + + def forward(self, x): + B, N, C = x.shape + qkv, _ = self.qkv(x) + q, k, v = qkv.chunk(3, dim=-1) + + q = q.view(B, N, self.num_heads_per_partition, self.head_dim) + k = k.view(B, N, self.num_heads_per_partition, self.head_dim) + v = v.view(B, N, self.num_heads_per_partition, self.head_dim) + + if self.qk_normalization: + B_, N_, H_, D_ = q.shape + q = self.q_norm.forward_native( + q.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) + k = self.k_norm.forward_native( + k.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) + + x = xops.memory_efficient_attention_forward(q, k, v, scale=self.scale) + x = x.view(B, N, -1) + + x, _ = self.proj(x) + return x + + +class NVLMSdpaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f'embed_dim must be divisible by num_heads ' + f'(got `embed_dim`: {self.embed_dim} and `num_heads`:' + f' {self.num_heads}).') + + # We added additional dummy heads to the original num of heads to make 
+ # the number of heads divisible by 8. + self.num_dummy_heads = 7 + self.dummy_dim = (self.num_dummy_heads + + self.num_heads) * self.head_dim + + self.scale = self.head_dim**-0.5 + self.qkv = nn.Linear(self.embed_dim, + 3 * self.dummy_dim, + bias=config.qkv_bias) + + self.qk_normalization = config.qk_normalization + + if self.qk_normalization: + self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + + self.proj = nn.Linear(self.dummy_dim, self.embed_dim) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x) + q, k, v = qkv.chunk(3, dim=-1) + + q = q.view(B, N, self.num_dummy_heads + self.num_heads, self.head_dim) + k = k.view(B, N, self.num_dummy_heads + self.num_heads, self.head_dim) + v = v.view(B, N, self.num_dummy_heads + self.num_heads, self.head_dim) + + if self.qk_normalization: + B_, N_, H_, D_ = q.shape + q = self.q_norm.forward_native( + q.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) + k = self.k_norm.forward_native( + k.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + + x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) + x = x.transpose(1, 2).view(B, N, -1) + + x = self.proj(x) + return x + + +class NVLMVisionEncoderLayer(InternVisionEncoderLayer): + + def _init_attn(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]): + # fallback to sdpa attention if tp unavailable + tp_size = get_tensor_model_parallel_world_size() + num_heads = config.num_attention_heads + + if USE_XFORMERS_OPS and num_heads % tp_size == 0: + return NVLMParallelAttention(config, quant_config=quant_config) + + return NVLMSdpaAttention(config) + + +class NVLMVisionEncoder(InternVisionEncoder): + + def _init_encoder_layer(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig]): + return NVLMVisionEncoderLayer(config=config, quant_config=quant_config) @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) @@ -21,7 +197,7 @@ @INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) @INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) class NVLM_D_Model(InternVLChatModel): - + def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: vit_hidden_size = config.vision_config.hidden_size llm_intermediate_size = config.text_config.intermediate_size From efb8f26a377aece1c39fca07283f6b1e2b4915f6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 18:45:50 +0000 Subject: [PATCH 07/27] Load the correct vision model --- vllm/model_executor/models/intern_vit.py | 14 ++++++++++++-- vllm/model_executor/models/internvl.py | 8 ++++++-- vllm/model_executor/models/nvlm_d.py | 19 ++++++++++++++++++- 3 files changed, 36 insertions(+), 5 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 669fed12a407..dede447f809b 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -325,10 +325,20 @@ def __init__(self, self.config = config self.embeddings = InternVisionEmbeddings(config) - self.encoder = InternVisionEncoder( + self.encoder = self._init_encoder( + config, + quant_config, + num_hidden_layers_override=num_hidden_layers_override, + ) + + def _init_encoder(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + num_hidden_layers_override: Optional[int]): + return InternVisionEncoder( 
config=config, quant_config=quant_config, - num_hidden_layers_override=num_hidden_layers_override) + num_hidden_layers_override=num_hidden_layers_override, + ) def resize_pos_embeddings(self, old_size, new_size, patch_size): pos_emb = self.embeddings.position_embedding diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 040a9a979ef5..1325b98be38b 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -389,8 +389,7 @@ def __init__(self, + vision_feature_layer + 1 else: num_hidden_layers = vision_feature_layer + 1 - self.vision_model = InternVisionModel( - config.vision_config, num_hidden_layers_override=num_hidden_layers) + self.vision_model = self._init_vision_model(config, num_hidden_layers) self.language_model = init_vllm_registered_model( config.text_config, cache_config, quant_config) @@ -406,6 +405,11 @@ def __init__(self, else: self.sampler = Sampler() + def _init_vision_model(self, config: PretrainedConfig, + num_hidden_layers: int): + return InternVisionModel(config.vision_config, + num_hidden_layers_override=num_hidden_layers) + def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: vit_hidden_size = config.vision_config.hidden_size llm_hidden_size = config.text_config.hidden_size diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index b7ee9401ca53..be8231cddab2 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -20,7 +20,7 @@ from vllm.multimodal import MULTIMODAL_REGISTRY from .intern_vit import (InternVisionEmbeddings, InternVisionEncoder, - InternVisionEncoderLayer) + InternVisionEncoderLayer, InternVisionModel) from .internvl import (InternVLChatModel, dummy_data_for_internvl, get_max_internvl_image_tokens, input_mapper_for_internvl, input_processor_for_internvl) @@ -192,6 +192,18 @@ def _init_encoder_layer(self, config: PretrainedConfig, return NVLMVisionEncoderLayer(config=config, quant_config=quant_config) +class NVLMVisionModel(InternVisionModel): + + def _init_encoder(self, config: PretrainedConfig, + quant_config: Optional[QuantizationConfig], + num_hidden_layers_override: Optional[int]): + return NVLMVisionEncoder( + config=config, + quant_config=quant_config, + num_hidden_layers_override=num_hidden_layers_override, + ) + + @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) @@ -211,3 +223,8 @@ def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: nn.GELU(), nn.Linear(llm_intermediate_size, llm_hidden_size, bias=False), ) + + def _init_vision_model(self, config: PretrainedConfig, + num_hidden_layers: int): + return NVLMVisionModel(config.vision_config, + num_hidden_layers_override=num_hidden_layers) From 26f4496cd476bc062ed2b4aca2a7c973aa77ad9c Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 19:02:11 +0000 Subject: [PATCH 08/27] Adopt the original version of RMSNorm which uses custom variance --- vllm/model_executor/layers/layernorm.py | 13 +++++- vllm/model_executor/models/nvlm_d.py | 54 +++++++++++++++++++------ 2 files changed, 53 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 14f60e9172f2..8b5230f07c97 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -27,6 
+27,7 @@ def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, + variance: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype @@ -35,7 +36,9 @@ def forward_native( x = x + residual.to(torch.float32) residual = x.to(orig_dtype) - variance = x.pow(2).mean(dim=-1, keepdim=True) + if variance is None: + variance = x.pow(2).mean(dim=-1, keepdim=True) + x = x * torch.rsqrt(variance + self.variance_epsilon) x = x.to(orig_dtype) * self.weight if residual is None: @@ -47,7 +50,11 @@ def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, + variance: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if variance is not None: + return self.forward_native(x, residual, variance) + from vllm import _custom_ops as ops if residual is not None: @@ -71,7 +78,11 @@ def forward_xpu( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, + variance: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: + if variance is not None: + return self.forward_native(x, residual, variance) + from vllm._ipex_ops import ipex_ops as ops if residual is not None: diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index be8231cddab2..daa4b2c09ef4 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -4,7 +4,7 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from typing import Optional +from typing import Iterable, Optional, Tuple import torch import torch.nn as nn @@ -17,6 +17,7 @@ from vllm.model_executor.layers.linear import (QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.multimodal import MULTIMODAL_REGISTRY from .intern_vit import (InternVisionEmbeddings, InternVisionEncoder, @@ -75,8 +76,8 @@ def __init__( self.qk_normalization = config.qk_normalization if self.qk_normalization: - self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) - self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.q_norm = RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) + self.k_norm = RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) self.proj = RowParallelLinear( self.dummy_dim, @@ -99,10 +100,19 @@ def forward(self, x): if self.qk_normalization: B_, N_, H_, D_ = q.shape - q = self.q_norm.forward_native( - q.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) - k = self.k_norm.forward_native( - k.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) + + q_var = q.transpose(1, 2).flatten( + -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) + + k_var = k.transpose(1, 2).flatten( + -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) + + q = self.q_norm.forward_native(q.flatten(-2, -1), + variance=q_var).view( + B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, -1), + variance=k_var).view( + B_, N_, H_, D_) x = xops.memory_efficient_attention_forward(q, k, v, scale=self.scale) x = x.view(B, N, -1) @@ -140,8 +150,8 @@ def __init__(self, config: PretrainedConfig): self.qk_normalization = config.qk_normalization if self.qk_normalization: - self.q_norm = RMSNorm(self.embed_dim, 
eps=config.layer_norm_eps) - self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) + self.q_norm = RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) + self.k_norm = RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) self.proj = nn.Linear(self.dummy_dim, self.embed_dim) @@ -156,10 +166,20 @@ def forward(self, x): if self.qk_normalization: B_, N_, H_, D_ = q.shape - q = self.q_norm.forward_native( - q.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) - k = self.k_norm.forward_native( - k.flatten(-2, -1)[:, :, :self.embed_dim]).view(B_, N_, H_, D_) + + q_var = q.transpose(1, 2).flatten( + -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) + + k_var = k.transpose(1, 2).flatten( + -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) + + q = self.q_norm.forward_native(q.flatten(-2, -1), + variance=q_var).view( + B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, -1), + variance=k_var).view( + B_, N_, H_, D_) + q = q.transpose(1, 2) k = k.transpose(1, 2) v = v.transpose(1, 2) @@ -203,6 +223,14 @@ def _init_encoder(self, config: PretrainedConfig, num_hidden_layers_override=num_hidden_layers_override, ) + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + params_dict = dict(self.named_parameters()) + for name, loaded_weight in weights: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) + @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) From d76ef5006d05dc070333722eceb35310e7b8bff9 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Thu, 3 Oct 2024 19:20:41 +0000 Subject: [PATCH 09/27] Remove extra transpose --- vllm/model_executor/models/nvlm_d.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index daa4b2c09ef4..262d9748c98e 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -101,11 +101,10 @@ def forward(self, x): if self.qk_normalization: B_, N_, H_, D_ = q.shape - q_var = q.transpose(1, 2).flatten( - -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) - - k_var = k.transpose(1, 2).flatten( - -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) + q_var = q.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( + -1, keepdim=True) + k_var = k.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( + -1, keepdim=True) q = self.q_norm.forward_native(q.flatten(-2, -1), variance=q_var).view( @@ -167,11 +166,10 @@ def forward(self, x): if self.qk_normalization: B_, N_, H_, D_ = q.shape - q_var = q.transpose(1, 2).flatten( - -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) - - k_var = k.transpose(1, 2).flatten( - -2, -1)[:, :, :self.embed_dim].pow(2).mean(-1, keepdim=True) + q_var = q.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( + -1, keepdim=True) + k_var = k.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( + -1, keepdim=True) q = self.q_norm.forward_native(q.flatten(-2, -1), variance=q_var).view( From dead63e2a0dba49249e29d4965619cabc61ccead Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 02:44:04 +0000 Subject: [PATCH 10/27] Simplify code --- vllm/model_executor/layers/layernorm.py | 21 ++++++----- vllm/model_executor/models/nvlm_d.py | 48 +++++++++++-------------- 2 files changed, 32 insertions(+), 37 deletions(-) diff --git 
a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py index 8b5230f07c97..7d2e6e6ed4b3 100644 --- a/vllm/model_executor/layers/layernorm.py +++ b/vllm/model_executor/layers/layernorm.py @@ -18,16 +18,17 @@ def __init__( self, hidden_size: int, eps: float = 1e-6, + var_hidden_size: Optional[int] = None, ) -> None: super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) self.variance_epsilon = eps + self.var_hidden_size = var_hidden_size def forward_native( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - variance: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """PyTorch-native implementation equivalent to forward().""" orig_dtype = x.dtype @@ -36,8 +37,12 @@ def forward_native( x = x + residual.to(torch.float32) residual = x.to(orig_dtype) - if variance is None: - variance = x.pow(2).mean(dim=-1, keepdim=True) + if self.var_hidden_size is None: + x_var = x + else: + x_var = x[:, :, :self.var_hidden_size] + + variance = x_var.pow(2).mean(dim=-1, keepdim=True) x = x * torch.rsqrt(variance + self.variance_epsilon) x = x.to(orig_dtype) * self.weight @@ -50,10 +55,9 @@ def forward_cuda( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - variance: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if variance is not None: - return self.forward_native(x, residual, variance) + if self.var_hidden_size is not None: + return self.forward_native(x, residual) from vllm import _custom_ops as ops @@ -78,10 +82,9 @@ def forward_xpu( self, x: torch.Tensor, residual: Optional[torch.Tensor] = None, - variance: Optional[torch.Tensor] = None, ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: - if variance is not None: - return self.forward_native(x, residual, variance) + if self.var_hidden_size is not None: + return self.forward_native(x, residual) from vllm._ipex_ops import ipex_ops as ops diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 262d9748c98e..865ab59c6fae 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -76,8 +76,12 @@ def __init__( self.qk_normalization = config.qk_normalization if self.qk_normalization: - self.q_norm = RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) - self.k_norm = RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) + self.q_norm = RMSNorm(self.dummy_dim, + eps=config.layer_norm_eps, + var_hidden_size=self.embed_dim) + self.k_norm = RMSNorm(self.dummy_dim, + eps=config.layer_norm_eps, + var_hidden_size=self.embed_dim) self.proj = RowParallelLinear( self.dummy_dim, @@ -100,18 +104,10 @@ def forward(self, x): if self.qk_normalization: B_, N_, H_, D_ = q.shape - - q_var = q.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( - -1, keepdim=True) - k_var = k.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( - -1, keepdim=True) - - q = self.q_norm.forward_native(q.flatten(-2, -1), - variance=q_var).view( - B_, N_, H_, D_) - k = self.k_norm.forward_native(k.flatten(-2, -1), - variance=k_var).view( - B_, N_, H_, D_) + q = self.q_norm.forward_native(q.flatten(-2, + -1)).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, + -1)).view(B_, N_, H_, D_) x = xops.memory_efficient_attention_forward(q, k, v, scale=self.scale) x = x.view(B, N, -1) @@ -149,8 +145,12 @@ def __init__(self, config: PretrainedConfig): self.qk_normalization = config.qk_normalization if self.qk_normalization: - self.q_norm = 
RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) - self.k_norm = RMSNorm(self.dummy_dim, eps=config.layer_norm_eps) + self.q_norm = RMSNorm(self.dummy_dim, + eps=config.layer_norm_eps, + var_hidden_size=self.embed_dim) + self.k_norm = RMSNorm(self.dummy_dim, + eps=config.layer_norm_eps, + var_hidden_size=self.embed_dim) self.proj = nn.Linear(self.dummy_dim, self.embed_dim) @@ -165,18 +165,10 @@ def forward(self, x): if self.qk_normalization: B_, N_, H_, D_ = q.shape - - q_var = q.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( - -1, keepdim=True) - k_var = k.flatten(-2, -1)[:, :, :self.embed_dim].pow(2).mean( - -1, keepdim=True) - - q = self.q_norm.forward_native(q.flatten(-2, -1), - variance=q_var).view( - B_, N_, H_, D_) - k = self.k_norm.forward_native(k.flatten(-2, -1), - variance=k_var).view( - B_, N_, H_, D_) + q = self.q_norm.forward_native(q.flatten(-2, + -1)).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, + -1)).view(B_, N_, H_, D_) q = q.transpose(1, 2) k = k.transpose(1, 2) From f0d3003f2cf152a9b60aab5a3c18a97ca5e0d492 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 02:46:27 +0000 Subject: [PATCH 11/27] Remove unused code --- vllm/model_executor/models/intern_vit.py | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index dede447f809b..6347be875243 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -340,23 +340,6 @@ def _init_encoder(self, config: PretrainedConfig, num_hidden_layers_override=num_hidden_layers_override, ) - def resize_pos_embeddings(self, old_size, new_size, patch_size): - pos_emb = self.embeddings.position_embedding - _, num_positions, embed_dim = pos_emb.shape - cls_emb = pos_emb[:, :1, :] - pos_emb = pos_emb[:, 1:, :].reshape(1, old_size // patch_size, - old_size // patch_size, - -1).permute(0, 3, 1, 2) - pos_emb = F.interpolate(pos_emb.float(), - size=new_size // patch_size, - mode='bicubic', - align_corners=False) - pos_emb = pos_emb.to(cls_emb.dtype).reshape(1, embed_dim, - -1).permute(0, 2, 1) - pos_emb = torch.cat([cls_emb, pos_emb], dim=1) - self.embeddings.position_embedding = nn.Parameter(pos_emb) - self.embeddings.image_size = new_size - def get_input_embeddings(self): return self.embeddings From 72a71d5f61e7c9d0ecda04a73c0a434825aabfe6 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 03:05:05 +0000 Subject: [PATCH 12/27] Update input processing --- vllm/model_executor/models/internvl.py | 115 +++++++++++++++---------- vllm/model_executor/models/nvlm_d.py | 40 +++++++-- 2 files changed, 104 insertions(+), 51 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 1325b98be38b..f755cbe9f8e8 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -6,8 +6,8 @@ # -------------------------------------------------------- import re from functools import partial -from typing import (Any, Dict, Iterable, List, Literal, Mapping, Optional, - Tuple, TypedDict, Union) +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Tuple, TypedDict, Union) import torch import torch.nn as nn @@ -238,57 +238,80 @@ def get_max_internvl_image_size(ctx: InputContext, return width, height -def input_processor_for_internvl(ctx: InputContext, - llm_inputs: LLMInputs, - *, - max_dynamic_patch: Optional[int] = None): - multi_modal_data = 
llm_inputs.get("multi_modal_data") - if multi_modal_data is None or "image" not in multi_modal_data: - return llm_inputs - - model_config = ctx.model_config - hf_config = ctx.get_hf_config() - - image_data = multi_modal_data["image"] - num_patches = get_internvl_num_patches(hf_config) - num_blocks_calculator = calculate_num_blocks_wrapper( - hf_config, max_dynamic_patch) - if isinstance(image_data, Image.Image): - width, height = image_data.size - num_blocks, _, _ = num_blocks_calculator(width, height) - image_feature_size = [num_blocks * num_patches] - elif is_list_of(image_data, Image.Image): - image_feature_size = [] - for image in image_data: - width, height = image.size - num_blocks, _, _ = num_blocks_calculator(width, height) - image_feature_size.append(num_blocks * num_patches) - elif isinstance(image_data, torch.Tensor): - num_images, image_feature_size, hidden_size = image_data.shape - else: - raise TypeError(f"Invalid image type: {type(image_data)}") - - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - - prompt = llm_inputs.get("prompt") - prompt_token_ids = llm_inputs["prompt_token_ids"] - if prompt is None: - prompt = tokenizer.decode(prompt_token_ids) - +def _expand_internvl_image_prompt( + prompt: str, + image_feature_sizes: List[int], + num_patches: int, +) -> str: new_prompt = prompt image_idx = sorted(map(int, re.findall(r"Image-(\d+): \n", prompt))) - for idx, feature_size in enumerate(image_feature_size, start=1): + for idx, feature_size in enumerate(image_feature_sizes, start=1): image_prompt = IMG_START + IMG_CONTEXT * feature_size + IMG_END if not image_idx: image_prompt = f"Image-{idx}: {image_prompt}" new_prompt = new_prompt.replace('', image_prompt, 1) - new_prompt_token_ids = tokenizer.encode(new_prompt) - return LLMInputs(prompt=prompt, - prompt_token_ids=new_prompt_token_ids, - multi_modal_data=multi_modal_data) + return new_prompt + + +def build_input_processor(expand_image_prompt: Callable[[str, List[int], int], + str]): + + def input_processor( + ctx: InputContext, + llm_inputs: LLMInputs, + *, + max_dynamic_patch: Optional[int] = None, + ) -> LLMInputs: + multi_modal_data = llm_inputs.get("multi_modal_data") + if multi_modal_data is None or "image" not in multi_modal_data: + return llm_inputs + + model_config = ctx.model_config + hf_config = ctx.get_hf_config() + + image_data = multi_modal_data["image"] + num_patches = get_internvl_num_patches(hf_config) + num_blocks_calculator = calculate_num_blocks_wrapper( + hf_config, max_dynamic_patch) + if isinstance(image_data, Image.Image): + width, height = image_data.size + num_blocks, _, _ = num_blocks_calculator(width, height) + image_feature_sizes = [num_blocks * num_patches] + elif is_list_of(image_data, Image.Image): + image_feature_sizes = [] + for image in image_data: + width, height = image.size + num_blocks, _, _ = num_blocks_calculator(width, height) + image_feature_sizes.append(num_blocks * num_patches) + elif isinstance(image_data, torch.Tensor): + num_images, image_feature_size, hidden_size = image_data.shape + image_feature_sizes = [image_feature_size] + else: + raise TypeError(f"Invalid image type: {type(image_data)}") + + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) + + prompt = llm_inputs.get("prompt") + prompt_token_ids = llm_inputs["prompt_token_ids"] + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + + new_prompt = expand_image_prompt(prompt, 
image_feature_sizes, + num_patches) + new_prompt_token_ids = tokenizer.encode(new_prompt) + + return LLMInputs(prompt=prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data) + + return input_processor + + +input_processor_for_internvl = build_input_processor( + _expand_internvl_image_prompt) def input_mapper_for_internvl(ctx: InputContext, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 865ab59c6fae..4d84622e0c11 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -4,7 +4,8 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -from typing import Iterable, Optional, Tuple +import re +from typing import Iterable, List, Optional, Tuple import torch import torch.nn as nn @@ -22,9 +23,9 @@ from .intern_vit import (InternVisionEmbeddings, InternVisionEncoder, InternVisionEncoderLayer, InternVisionModel) -from .internvl import (InternVLChatModel, dummy_data_for_internvl, - get_max_internvl_image_tokens, - input_mapper_for_internvl, input_processor_for_internvl) +from .internvl import (InternVLChatModel, build_input_processor, + dummy_data_for_internvl, get_max_internvl_image_tokens, + input_mapper_for_internvl) try: from xformers import ops as xops @@ -32,6 +33,35 @@ except ImportError: USE_XFORMERS_OPS = False +IMG_START = '<|vision_start|>' +IMG_END = '<|vision_end|>' +IMG_CONTEXT = '<|vision_pad|>' + + +def _expand_nvlm_image_prompt( + prompt: str, + image_feature_sizes: List[int], + num_patches: int, +) -> str: + new_prompt = prompt + image_idx = sorted(map(int, re.findall(r"Image-(\d+): \n", prompt))) + for idx, feature_size in enumerate(image_feature_sizes, start=1): + tile_pos_identifiers = ([f"" + for i in range(1, num_patches)] + + [""]) + image_prompt = '' + ''.join( + tile_pos_identifier + IMG_CONTEXT * feature_size + for tile_pos_identifier in tile_pos_identifiers) + '' + + if not image_idx: + image_prompt = f"Image-{idx}: {image_prompt}" + new_prompt = new_prompt.replace('', image_prompt, 1) + + return new_prompt + + +input_processor_for_nvlm = build_input_processor(_expand_nvlm_image_prompt) + class NVLMVisionEmbeddings(InternVisionEmbeddings): @@ -225,7 +255,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): @MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) @INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) -@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) +@INPUT_REGISTRY.register_input_processor(input_processor_for_nvlm) class NVLM_D_Model(InternVLChatModel): def _init_mlp1(self, config: PretrainedConfig) -> nn.Sequential: From 1a8fd37834fe8d4735673405cfc0f028c5bfd182 Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 03:07:49 +0000 Subject: [PATCH 13/27] Format --- vllm/model_executor/models/internvl.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 50273cd23644..7e1c8478a50e 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -6,8 +6,8 @@ # -------------------------------------------------------- import re from functools import cached_property, partial -from typing import (Callable, Iterable, List, Literal, Mapping, Optional, Tuple, - 
TypedDict, Union) +from typing import (Callable, Iterable, List, Literal, Mapping, Optional, + Tuple, TypedDict, Union) import torch import torch.nn as nn From bfd910a9cf9544a16f2e3e611a8d893f87a5c69b Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 03:26:52 +0000 Subject: [PATCH 14/27] Fix and abstract input pipeline --- vllm/model_executor/models/internvl.py | 186 ++++++++++++++----------- vllm/model_executor/models/nvlm_d.py | 39 ++---- 2 files changed, 117 insertions(+), 108 deletions(-) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 7e1c8478a50e..5048e9aa240c 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -6,8 +6,8 @@ # -------------------------------------------------------- import re from functools import cached_property, partial -from typing import (Callable, Iterable, List, Literal, Mapping, Optional, - Tuple, TypedDict, Union) +from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, + TypedDict, Union) import torch import torch.nn as nn @@ -237,26 +237,45 @@ def get_max_internvl_image_size(ctx: InputContext, return width, height -def _expand_internvl_image_prompt( - prompt: str, - image_feature_sizes: List[int], - num_patches: int, -) -> str: - new_prompt = prompt - image_idx = sorted(map(int, re.findall(r"Image-(\d+): \n", prompt))) - for idx, feature_size in enumerate(image_feature_sizes, start=1): - image_prompt = IMG_START + IMG_CONTEXT * feature_size + IMG_END - if not image_idx: - image_prompt = f"Image-{idx}: {image_prompt}" - new_prompt = new_prompt.replace('', image_prompt, 1) +class InternVLInputPipeline: - return new_prompt + def __init__( + self, + img_start_token: str, + img_end_token: str, + img_context_token: str, + ) -> None: + super().__init__() + + self.img_start_token = img_start_token + self.img_end_token = img_end_token + self.img_context_token = img_context_token + + def _create_image_prompt(self, feature_size: int, num_patches: int) -> str: + return (self.img_start_token + self.img_context_token * feature_size + + self.img_end_token) + def _expand_image_prompt( + self, + prompt: str, + feature_sizes: List[int], + num_patches: int, + ) -> str: + image_idx = sorted( + map(int, re.findall(r"Image-(\d+): \n", prompt))) + + new_prompt = prompt + for idx, feature_size in enumerate(feature_sizes, start=1): + image_prompt = self._create_image_prompt(feature_size, num_patches) + if not image_idx: + image_prompt = f"Image-{idx}: {image_prompt}" -def build_input_processor(expand_image_prompt: Callable[[str, List[int], int], - str]): + new_prompt = new_prompt.replace('', image_prompt, 1) + + return new_prompt def input_processor( + self, ctx: InputContext, llm_inputs: LLMInputs, *, @@ -298,92 +317,93 @@ def input_processor( if prompt is None: prompt = tokenizer.decode(prompt_token_ids) - new_prompt = expand_image_prompt(prompt, image_feature_sizes, - num_patches) + new_prompt = self._expand_image_prompt(prompt, image_feature_sizes, + num_patches) new_prompt_token_ids = tokenizer.encode(new_prompt) return LLMInputs(prompt=prompt, prompt_token_ids=new_prompt_token_ids, multi_modal_data=multi_modal_data) - return input_processor + def input_mapper( + self, + ctx: InputContext, + data: object, + *, + max_dynamic_patch: Optional[int] = None, + ): + hf_config = ctx.get_hf_config() + image_pixel_values_mapper = image_to_pixel_values_wrapper( + hf_config, max_dynamic_patch) + if isinstance(data, Image.Image): + data = 
image_pixel_values_mapper(data) + # Add an N dimension for number of images per prompt (currently 1). + data = data.unsqueeze(0) + elif is_list_of(data, Image.Image): + # we can't stack here because images may have different num_patches + data = [image_pixel_values_mapper(img) for img in data] + model_config = ctx.model_config + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) + image_token_id = tokenizer.encode(self.img_context_token, + add_special_tokens=False, + return_tensors="pt")[0] -input_processor_for_internvl = build_input_processor( - _expand_internvl_image_prompt) + return MultiModalInputs({ + "pixel_values": data, + "image_token_id": image_token_id + }) + def dummy_data( + self, + ctx: InputContext, + seq_len: int, + mm_counts: Mapping[str, int], + *, + max_dynamic_patch: Optional[int] = None, + ): + num_images = mm_counts["image"] -def input_mapper_for_internvl(ctx: InputContext, - data: object, - *, - max_dynamic_patch: Optional[int] = None): - hf_config = ctx.get_hf_config() + hf_config = ctx.get_hf_config() - image_pixel_values_mapper = image_to_pixel_values_wrapper( - hf_config, max_dynamic_patch) - if isinstance(data, Image.Image): - data = image_pixel_values_mapper(data) - # Add an N dimension for number of images per prompt (currently 1). - data = data.unsqueeze(0) - elif is_list_of(data, Image.Image): - # we can't stack here because the images may have different num_patches - data = [image_pixel_values_mapper(img) for img in data] - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) - image_token_id = tokenizer.encode(IMG_CONTEXT, - add_special_tokens=False, - return_tensors="pt")[0] - - return MultiModalInputs({ - "pixel_values": data, - "image_token_id": image_token_id - }) - - -def dummy_data_for_internvl(ctx: InputContext, - seq_len: int, - mm_counts: Mapping[str, int], - *, - max_dynamic_patch: Optional[int] = None): - num_images = mm_counts["image"] + image_feature_size = get_max_internvl_image_tokens( + ctx, max_dynamic_patch=max_dynamic_patch) + model_config = ctx.model_config + tokenizer = cached_get_tokenizer( + model_config.tokenizer, + trust_remote_code=model_config.trust_remote_code) - hf_config = ctx.get_hf_config() + seq_data = dummy_seq_data_for_clip( + hf_config.vision_config, + seq_len, + num_images, + image_token_id=tokenizer.encode(self.img_context_token, + add_special_tokens=False)[0], + image_feature_size_override=image_feature_size, + ) - image_feature_size = get_max_internvl_image_tokens( - ctx, max_dynamic_patch=max_dynamic_patch) - model_config = ctx.model_config - tokenizer = cached_get_tokenizer( - model_config.tokenizer, - trust_remote_code=model_config.trust_remote_code) + max_image_width, max_image_height = get_max_internvl_image_size( + ctx, max_dynamic_patch=max_dynamic_patch) - seq_data = dummy_seq_data_for_clip( - hf_config.vision_config, - seq_len, - num_images, - image_token_id=tokenizer.encode(IMG_CONTEXT, - add_special_tokens=False)[0], - image_feature_size_override=image_feature_size, - ) + mm_data = dummy_image_for_clip( + hf_config.vision_config, + num_images, + image_width_override=max_image_width, + image_height_override=max_image_height, + ) - max_image_width, max_image_height = get_max_internvl_image_size( - ctx, max_dynamic_patch=max_dynamic_patch) + return seq_data, mm_data - mm_data = dummy_image_for_clip( - hf_config.vision_config, - num_images, - 
image_width_override=max_image_width, - image_height_override=max_image_height, - ) - return seq_data, mm_data +input_pipeline = InternVLInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) -@INPUT_REGISTRY.register_input_processor(input_processor_for_internvl) +@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) +@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) class InternVLChatModel(nn.Module, SupportsMultiModal, SupportsPP): def __init__(self, diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py index 4d84622e0c11..678ac1c7f171 100644 --- a/vllm/model_executor/models/nvlm_d.py +++ b/vllm/model_executor/models/nvlm_d.py @@ -4,8 +4,7 @@ # Copyright (c) 2024 NVIDIA # Licensed under Apache 2.0 License [see LICENSE for details] # -------------------------------------------------------- -import re -from typing import Iterable, List, Optional, Tuple +from typing import Iterable, Optional, Tuple import torch import torch.nn as nn @@ -23,9 +22,8 @@ from .intern_vit import (InternVisionEmbeddings, InternVisionEncoder, InternVisionEncoderLayer, InternVisionModel) -from .internvl import (InternVLChatModel, build_input_processor, - dummy_data_for_internvl, get_max_internvl_image_tokens, - input_mapper_for_internvl) +from .internvl import (InternVLChatModel, InternVLInputPipeline, + get_max_internvl_image_tokens) try: from xformers import ops as xops @@ -38,29 +36,20 @@ IMG_CONTEXT = '<|vision_pad|>' -def _expand_nvlm_image_prompt( - prompt: str, - image_feature_sizes: List[int], - num_patches: int, -) -> str: - new_prompt = prompt - image_idx = sorted(map(int, re.findall(r"Image-(\d+): \n", prompt))) - for idx, feature_size in enumerate(image_feature_sizes, start=1): +class NVLMInputPipeline(InternVLInputPipeline): + + def _create_image_prompt(self, feature_size: int, num_patches: int) -> str: tile_pos_identifiers = ([f"" for i in range(1, num_patches)] + [""]) - image_prompt = '' + ''.join( - tile_pos_identifier + IMG_CONTEXT * feature_size - for tile_pos_identifier in tile_pos_identifiers) + '' + context_size = feature_size // num_patches - if not image_idx: - image_prompt = f"Image-{idx}: {image_prompt}" - new_prompt = new_prompt.replace('', image_prompt, 1) - - return new_prompt + return '' + ''.join( + tile_pos_identifier + self.img_context_token * context_size + for tile_pos_identifier in tile_pos_identifiers) + '' -input_processor_for_nvlm = build_input_processor(_expand_nvlm_image_prompt) +input_pipeline = NVLMInputPipeline(IMG_START, IMG_END, IMG_CONTEXT) class NVLMVisionEmbeddings(InternVisionEmbeddings): @@ -252,10 +241,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight) -@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_internvl) +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) -@INPUT_REGISTRY.register_dummy_data(dummy_data_for_internvl) -@INPUT_REGISTRY.register_input_processor(input_processor_for_nvlm) +@INPUT_REGISTRY.register_dummy_data(input_pipeline.dummy_data) +@INPUT_REGISTRY.register_input_processor(input_pipeline.input_processor) class 
From fc710a1cc91e0e7661933871ad4c493a589426a3 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 4 Oct 2024 03:31:50 +0000
Subject: [PATCH 15/27] Add support for online serving

---
 vllm/entrypoints/chat_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 130f3ba49f3e..83c4062dd511 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -157,7 +157,7 @@ def _placeholder_str(self, modality: ModalityStr,
         if model_type.startswith("llava"):
             return self._cached_token_str(self._tokenizer,
                                           hf_config.image_token_index)
-        if model_type in ("chameleon", "internvl_chat"):
+        if model_type in ("chameleon", "internvl_chat", "NVLM_D"):
             return "<image>"
         if model_type == "mllama":
             return "<|image|>"

From 6d38309f710635a5d007984764f11569016f4136 Mon Sep 17 00:00:00 2001
From: DarkLight1337
Date: Fri, 4 Oct 2024 03:33:06 +0000
Subject: [PATCH 16/27] Fix wrong embeddings

---
 vllm/model_executor/models/intern_vit.py |  5 ++++-
 vllm/model_executor/models/nvlm_d.py     | 14 ++++----------
 2 files changed, 8 insertions(+), 11 deletions(-)

diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py
index 6347be875243..e6669b17e266 100644
--- a/vllm/model_executor/models/intern_vit.py
+++ b/vllm/model_executor/models/intern_vit.py
@@ -324,13 +324,16 @@ def __init__(self,
         super().__init__()

         self.config = config
-        self.embeddings = InternVisionEmbeddings(config)
+        self.embeddings = self._init_embeddings(config)
         self.encoder = self._init_encoder(
             config,
             quant_config,
             num_hidden_layers_override=num_hidden_layers_override,
         )

+    def _init_embeddings(self, config: PretrainedConfig):
+        return InternVisionEmbeddings(config)
+
     def _init_encoder(self, config: PretrainedConfig,
                       quant_config: Optional[QuantizationConfig],
                       num_hidden_layers_override: Optional[int]):
diff --git a/vllm/model_executor/models/nvlm_d.py b/vllm/model_executor/models/nvlm_d.py
index 678ac1c7f171..c1378ffd3e90 100644
--- a/vllm/model_executor/models/nvlm_d.py
+++ b/vllm/model_executor/models/nvlm_d.py
@@ -4,7 +4,7 @@
 # Copyright (c) 2024 NVIDIA
 # Licensed under Apache 2.0 License [see LICENSE for details]
 # --------------------------------------------------------
-from typing import Iterable, Optional, Tuple
+from typing import Optional

 import torch
 import torch.nn as nn
@@ -17,7 +17,6 @@
 from vllm.model_executor.layers.linear import (QKVParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.multimodal import MULTIMODAL_REGISTRY

 from .intern_vit import (InternVisionEmbeddings, InternVisionEncoder,
@@ -223,6 +222,9 @@ def _init_encoder_layer(self, config: PretrainedConfig,

 class NVLMVisionModel(InternVisionModel):

+    def _init_embeddings(self, config: PretrainedConfig):
+        return NVLMVisionEmbeddings(config)
+
     def _init_encoder(self, config: PretrainedConfig,
                       quant_config: Optional[QuantizationConfig],
                       num_hidden_layers_override: Optional[int]):
@@ -232,14 +234,6 @@ def _init_encoder(self, config: PretrainedConfig,
             num_hidden_layers_override=num_hidden_layers_override,
         )

-    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
-        params_dict = dict(self.named_parameters())
-        for name, loaded_weight in weights:
-            param = params_dict[name]
-            weight_loader = getattr(param, "weight_loader",
-                                    default_weight_loader)
-            weight_loader(param, loaded_weight)
-

 @MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper)
 @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens)
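The fix above is a small template-method refactor: InternVisionModel.__init__ previously hard-coded InternVisionEmbeddings, so NVLMVisionModel silently ran with the InternVL embedding module even though NVLMVisionEmbeddings existed. Routing construction through an overridable _init_embeddings hook closes that gap. A toy sketch of the pattern follows; the classes are stand-ins, not the real vLLM modules:

import torch.nn as nn

class ToyVisionModel(nn.Module):
    def __init__(self, hidden_size: int) -> None:
        super().__init__()
        # Construction goes through a hook so subclasses can swap the module.
        self.embeddings = self._init_embeddings(hidden_size)

    def _init_embeddings(self, hidden_size: int) -> nn.Module:
        return nn.Linear(3, hidden_size)               # "base model" default

class ToyNVLMVisionModel(ToyVisionModel):
    def _init_embeddings(self, hidden_size: int) -> nn.Module:
        return nn.Linear(3, hidden_size, bias=False)   # model-specific variant

# The subclass's module is actually used, which is the behaviour the fix restores.
assert ToyNVLMVisionModel(8).embeddings.bias is None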
getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) - @MULTIMODAL_REGISTRY.register_image_input_mapper(input_pipeline.input_mapper) @MULTIMODAL_REGISTRY.register_max_image_tokens(get_max_internvl_image_tokens) From 3a89f90b33739977c99f2897929db2473cca96ca Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 03:45:58 +0000 Subject: [PATCH 17/27] Fix docs --- docs/source/models/supported_models.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index e624739e0493..03ae8609a52b 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -333,7 +333,8 @@ Multimodal Language Models - Image - :code:`meta-llama/Llama-3.2-90B-Vision-Instruct`, :code:`meta-llama/Llama-3.2-11B-Vision`, etc. - - * - :code:`NVLM_D` + - + * - :code:`NVLM_D_Model` - NVLM-D 1.0 - Image\ :sup:`E+` - :code:`nvidia/NVLM-D-72B`, etc. From 83d54f255811239e5cbb72eb3d132755e536fc3d Mon Sep 17 00:00:00 2001 From: DarkLight1337 Date: Fri, 4 Oct 2024 05:02:54 +0000 Subject: [PATCH 18/27] Update examples --- examples/offline_inference_vision_language.py | 55 ++++++++++++++----- ...e_inference_vision_language_multi_image.py | 34 ++++++++++++ 2 files changed, 74 insertions(+), 15 deletions(-) diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index b94ef537d783..efad7e33793d 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -18,7 +18,7 @@ # LLaVA-1.5 -def run_llava(question, modality): +def run_llava(question: str, modality: str): assert modality == "image" prompt = f"USER: \n{question}\nASSISTANT:" @@ -29,7 +29,7 @@ def run_llava(question, modality): # LLaVA-1.6/LLaVA-NeXT -def run_llava_next(question, modality): +def run_llava_next(question: str, modality: str): assert modality == "image" prompt = f"[INST] \n{question} [/INST]" @@ -40,7 +40,7 @@ def run_llava_next(question, modality): # LlaVA-NeXT-Video # Currently only support for video input -def run_llava_next_video(question, modality): +def run_llava_next_video(question: str, modality: str): assert modality == "video" prompt = f"USER: