From 0919bd134576bfa4abf84f011596be28b710cea0 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 11:02:30 +0900 Subject: [PATCH 01/29] feat: replace siglipattention with tp'ed one --- vllm/model_executor/models/siglip.py | 204 ++++----------------------- 1 file changed, 28 insertions(+), 176 deletions(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 5ba14f73394f..a29710110d4c 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -7,13 +7,11 @@ import torch from PIL import Image from torch import nn -from transformers import SiglipVisionConfig -from transformers.models.siglip.modeling_siglip import SiglipAttention -from vllm_flash_attn import flash_attn_func -from xformers.ops import memory_efficient_attention +from transformers import SiglipVisionConfig, SiglipVisionConfig +from xformers import ops as xops from vllm.config import ModelConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -210,9 +208,7 @@ def forward(self, return embeddings -# NOTE: Not used - kept for later when we TP the ViT -# TODO(ChristopherCho): Implement TP version of Attention -class SiglipTPAttention(nn.Module): +class SiglipAttention(nn.Module): def __init__( self, @@ -222,38 +218,30 @@ def __init__( super().__init__() self.config = config self.embed_dim = config.hidden_size - - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = config.num_attention_heads - if self.total_num_heads % tp_size != 0: - raise ValueError( - f"Number of attention heads ({self.total_num_heads}) " - "must be divisible by the tensor model parallel size" - f" ({tp_size}).") - - self.num_heads = self.total_num_heads // tp_size - self.head_dim = self.embed_dim // self.total_num_heads - if self.head_dim * self.total_num_heads != self.embed_dim: + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: raise ValueError(f"embed_dim must be divisible by num_heads (got " "`embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") - self.qkv_size = self.num_heads * self.head_dim + self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout - self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, head_size=self.head_dim, - total_num_heads=self.total_num_heads, + total_num_heads=self.num_heads, quant_config=quant_config, ) + self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, ) - self.attn_fn = self._basic_attention_forward + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def forward( self, @@ -263,163 +251,27 @@ def forward( batch_size, q_len, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) - query_states, key_states, value_states = qkv_states.split( - [self.qkv_size] * 3, dim=-1) - - attn_output = self.attn_fn( - q=query_states, - k=key_states, - v=value_states, - batch_size=batch_size, - q_len=q_len, - ) - - attn_output, _ = self.out_proj(attn_output) - return attn_output - - def _basic_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, 
self.num_heads, - self.head_dim).transpose(1, 2) - k = k.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - v = v.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - k_v_seq_len = k.shape[-2] - attn_weights = torch.matmul(q, k.transpose(2, 3)) * self.scale - - if attn_weights.size() != ( - batch_size, - self.num_heads, - q_len, - k_v_seq_len, - ): - raise ValueError( - "Attention weights should be of size " - f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}") - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to(q.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.dropout, - training=self.training) - attn_output = torch.matmul(attn_weights, v) - - if attn_output.size() != ( - batch_size, - self.num_heads, - q_len, - self.head_dim, - ): - raise ValueError( - "`attn_output` should be of size " - f"{(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -# TODO(ChristopherCho): flash_attn_func is not working properly. -# It constantly throws a CUDA error. -class SiglipFlashAttention2(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.attn_fn = self._flash_attention_forward - - # Ported from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L449 - # and https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/modeling_flash_attention_utils.py#L133 - def _flash_attention_forward(self, q, k, v, batch_size, q_len, *args, - **kwargs): - """Implements the multihead softmax attention. - Arguments - --------- - q, k, v: The tensor containing the - query, key, and value. 
(B, S, H, D) - """ - - q = q.view(batch_size, q_len, self.num_heads, self.head_dim) - k = k.view(batch_size, q_len, self.num_heads, self.head_dim) - v = v.view(batch_size, q_len, self.num_heads, self.head_dim) - - attn_output = flash_attn_func( - q, - k, - v, - dropout_p=self.dropout, - causal=False, - ) - - attn_output = attn_output.reshape(batch_size, q_len, + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + + query_states = query_states.view(batch_size, q_len, self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(batch_size, q_len, self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(batch_size, q_len, self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.reshape(batch_size, q_len, self.embed_dim).contiguous() + attn_output, _ = self.out_proj(out) return attn_output -# NOTE: Not used - kept for later when we TP the ViT -class SiglipSdpaAttention(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.is_causal = False - self.attn_fn = self._sdpa_attention_forward - - def _sdpa_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - k = k.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - v = v.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - attn_output = torch.nn.functional.scaled_dot_product_attention( - q, k, v, dropout_p=self.dropout, is_causal=False, scale=self.scale) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(batch_size, q_len, self.embed_dim) - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -class SiglipxFormersAttention(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.attn_fn = self._xformers_attention_forward - - def _xformers_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, self.num_heads, self.head_dim) - k = k.view(batch_size, q_len, self.num_heads, self.head_dim) - v = v.view(batch_size, q_len, self.num_heads, self.head_dim) - - attn_output = memory_efficient_attention(q, - k, - v, - p=0.0, - scale=self.scale) - attn_output = attn_output.reshape(batch_size, q_len, - self.embed_dim).contiguous() - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -SIGLIP_ATTENTION_CLASSES = { - "eager": SiglipTPAttention, - "flash_attention_2": SiglipFlashAttention2, - "sdpa": SiglipSdpaAttention, - "xformers": SiglipxFormersAttention, -} - - class SiglipMLP(nn.Module): def __init__( From 7cfc98c8b01063000b2b4c9431846853a3ae2b46 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:21:08 +0900 Subject: [PATCH 02/29] feat: tp blip attention --- vllm/model_executor/models/blip.py | 75 +++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 0b124d5e8a85..bc2529e4d1aa 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -6,17 +6,19 @@ import torch.nn as nn from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig -from transformers.models.blip.modeling_blip import BlipAttention from vllm.config import ModelConfig from 
vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from vllm.sequence import SequenceData +from vllm.distributed import divide, get_tensor_model_parallel_world_size +from xformers import ops as xops def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -150,6 +152,77 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings +class BlipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + quant_config=quant_config, + ) + self.projection = RowParallelLinear( + self.embed_dim, + self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + ): + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, _ = hidden_states.size() + + qkv_states, _ = self.qkv(hidden_states) + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + query_states = query_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward( + query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale + ) + out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + attn_output, _ = self.out_proj(out) + + return attn_output + + class BlipMLP(nn.Module): def __init__(self, From 079c53fc17a58376ca46a53dac3c26d221fc77f1 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:32:00 +0900 Subject: [PATCH 03/29] feat: clip attention replaced --- vllm/model_executor/models/clip.py | 76 +++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 805ade39389d..d7276e59af8b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -6,12 +6,13 @@ import torch.nn as nn from PIL import Image from transformers import CLIPVisionConfig -from transformers.models.clip.modeling_clip import CLIPAttention +from xformers import ops as xops from vllm.config import ModelConfig from vllm.inputs import LLMInputs from 
vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -149,6 +150,79 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.qkv_proj = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.num_heads, + quant_config=quant_config, + ) + + self.out_proj = RowParallelLinear( + input_size=self.embed_dim, + output_size=self.embed_dim, + quant_config=quant_config, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + ): + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + qkv_states, _ = self.qkv_proj(hidden_states) + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + + query_states = query_states * self.scale + + query_states = query_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + attn_output, _ = self.out_proj(out) + + return attn_output + + class CLIPMLP(nn.Module): def __init__(self, From 9a9af502feeae8ab1fb07e092fb057d1a4018a83 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:32:09 +0900 Subject: [PATCH 04/29] fix: style --- vllm/model_executor/models/blip.py | 29 ++++++++++++++-------------- vllm/model_executor/models/siglip.py | 20 ++++++++++--------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index bc2529e4d1aa..98fd391cdd00 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -6,8 +6,10 @@ import torch.nn as nn from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig +from xformers import ops as xops from vllm.config import ModelConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -17,8 +19,6 @@ from vllm.multimodal.image import (cached_get_tokenizer, 
repeat_and_pad_image_tokens) from vllm.sequence import SequenceData -from vllm.distributed import divide, get_tensor_model_parallel_world_size -from xformers import ops as xops def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -157,7 +157,7 @@ class BlipAttention(nn.Module): def __init__( self, - config, + config: BlipVisionConfig, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -167,9 +167,9 @@ def __init__( self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") self.scale = self.head_dim**-0.5 self.dropout = nn.Dropout(config.attention_dropout) @@ -184,12 +184,13 @@ def __init__( self.embed_dim, quant_config=quant_config, ) - + self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() def forward( self, @@ -210,13 +211,11 @@ def forward( self.num_heads_per_partition, self.head_dim) - out = xops.memory_efficient_attention_forward( - query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale - ) + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() attn_output, _ = self.out_proj(out) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index a29710110d4c..7db7332b797e 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -253,20 +253,22 @@ def forward( qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states.view(batch_size, q_len, self.num_heads_per_partition, + query_states = query_states.view(batch_size, q_len, + self.num_heads_per_partition, self.head_dim) - key_states = key_states.view(batch_size, q_len, self.num_heads_per_partition, + key_states = key_states.view(batch_size, q_len, + self.num_heads_per_partition, self.head_dim) - value_states = value_states.view(batch_size, q_len, self.num_heads_per_partition, + value_states = value_states.view(batch_size, q_len, + self.num_heads_per_partition, self.head_dim) out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - out = out.reshape(batch_size, q_len, - self.embed_dim).contiguous() + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.reshape(batch_size, q_len, self.embed_dim).contiguous() attn_output, _ = self.out_proj(out) return attn_output From 8176c8ed23bc50d12a106eb22f184e9254de4279 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:36:22 +0900 Subject: [PATCH 05/29] fix: provide qunatization config --- vllm/model_executor/models/blip.py | 2 +- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/siglip.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git 
a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 98fd391cdd00..f7bf57d1c8ae 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -256,7 +256,7 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = BlipAttention(config) + self.self_attn = BlipAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = BlipMLP(config, quant_config=quant_config) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index d7276e59af8b..2e711a5a77d2 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -255,7 +255,7 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = CLIPAttention(config) + self.self_attn = CLIPAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = CLIPMLP(config, quant_config=quant_config) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 7db7332b797e..afae822f9f43 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -316,8 +316,7 @@ def __init__( super().__init__() self.embed_dim = config.hidden_size - # TODO(ChristopherCho): use TP'ed Attention block - self.self_attn = SiglipAttention(config) + self.self_attn = SiglipAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( From b3bdbef457d2948140dcfea006858c9d964237f6 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:37:58 +0900 Subject: [PATCH 06/29] fix: return value of attention --- vllm/model_executor/models/blip.py | 2 +- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/siglip.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index f7bf57d1c8ae..cffa9a8a1071 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -267,7 +267,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 2e711a5a77d2..b57b0e2220ff 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -267,7 +267,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index afae822f9f43..0a01d386180d 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -333,7 +333,7 @@ def forward( residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = 
self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states From 8927414339dd4b487124e051b9789bc38958ff72 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:46:55 +0900 Subject: [PATCH 07/29] fix: add tp config in clip attention --- vllm/model_executor/models/clip.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index b57b0e2220ff..1164086d174b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -19,6 +19,8 @@ from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from vllm.sequence import SequenceData +from vllm.distributed import divide, get_tensor_model_parallel_world_size + def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -183,6 +185,9 @@ def __init__( output_size=self.embed_dim, quant_config=quant_config, ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, From 22a0a8450b1f21831711d09ba24efdaa39961634 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:47:15 +0900 Subject: [PATCH 08/29] feat: tp attention in intern vit --- vllm/model_executor/models/intern_vit.py | 66 +++++++++++++++++------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 54c933e3e495..d37e470847fc 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -14,9 +14,12 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.distributed import divide, get_tensor_model_parallel_world_size +from xformers import ops as xops NORM2FN = { 'rms_norm': RMSNorm, @@ -81,7 +84,11 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class InternAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: PretrainedConfig): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -94,9 +101,13 @@ def __init__(self, config: PretrainedConfig): f' {self.num_heads}).') self.scale = self.head_dim**-0.5 - self.qkv = nn.Linear(self.embed_dim, - 3 * self.embed_dim, - bias=config.qkv_bias) + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + ) self.qk_normalization = config.qk_normalization @@ -104,25 +115,42 @@ def __init__(self, config: PretrainedConfig): self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) - self.proj = nn.Linear(self.embed_dim, self.embed_dim) + self.proj = RowParallelLinear( + self.embed_dim, + self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + 
self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def forward(self, x): B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv.unbind(0) - + qkv, _ = self.qkv_proj(x) + q, k, v = qkv.chunk(3, dim=-1) + + q = q.view(B, N, self.num_heads_per_partition, self.head_dim) + k = k.view(B, N, self.num_heads_per_partition, self.head_dim) + v = v.view(B, N, self.num_heads_per_partition, self.head_dim) + if self.qk_normalization: - B_, H_, N_, D_ = q.shape - q = self.q_norm.forward_native(q.transpose(1, 2).flatten( - -2, -1)).view(B_, N_, H_, D_).transpose(1, 2) - k = self.k_norm.forward_native(k.transpose(1, 2).flatten( - -2, -1)).view(B_, N_, H_, D_).transpose(1, 2) - - x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - x = x.transpose(1, 2).reshape(B, N, C) - - x = self.proj(x) + B_, N_, H_, D_ = q.shape + q = self.q_norm.forward_native( + q.flatten(-2, -1) + ).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten( + -2, -1)).view(B_, N_, H_, D_) + + x = xops.memory_efficient_attention_forward( + q, + k, + v, + p=self.dropout, + scale=self.scale, + ) + x = x.view(B, N, -1) + + x, _ = self.proj(x) return x From f6063b126bbc66f21ed0037170507f19282d1ac5 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:48:56 +0900 Subject: [PATCH 09/29] fix: style fix --- vllm/model_executor/models/clip.py | 5 ++--- vllm/model_executor/models/intern_vit.py | 21 ++++++++++----------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 1164086d174b..cf60ecec484f 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -9,6 +9,7 @@ from xformers import ops as xops from vllm.config import ModelConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -19,8 +20,6 @@ from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from vllm.sequence import SequenceData -from vllm.distributed import divide, get_tensor_model_parallel_world_size - def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -185,7 +184,7 @@ def __init__( output_size=self.embed_dim, quant_config=quant_config, ) - + self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index d37e470847fc..0acc8b7baecb 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -10,7 +10,9 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig +from xformers import ops as xops +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -18,8 +20,6 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.distributed import divide, get_tensor_model_parallel_world_size -from xformers import ops as xops 
NORM2FN = { 'rms_norm': RMSNorm, @@ -85,7 +85,7 @@ class InternAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( - self, + self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, ): @@ -120,7 +120,7 @@ def __init__( self.embed_dim, quant_config=quant_config, ) - + self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) @@ -128,18 +128,17 @@ def forward(self, x): B, N, C = x.shape qkv, _ = self.qkv_proj(x) q, k, v = qkv.chunk(3, dim=-1) - + q = q.view(B, N, self.num_heads_per_partition, self.head_dim) k = k.view(B, N, self.num_heads_per_partition, self.head_dim) v = v.view(B, N, self.num_heads_per_partition, self.head_dim) - + if self.qk_normalization: B_, N_, H_, D_ = q.shape - q = self.q_norm.forward_native( - q.flatten(-2, -1) - ).view(B_, N_, H_, D_) - k = self.k_norm.forward_native(k.flatten( - -2, -1)).view(B_, N_, H_, D_) + q = self.q_norm.forward_native(q.flatten(-2, + -1)).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, + -1)).view(B_, N_, H_, D_) x = xops.memory_efficient_attention_forward( q, From 8e54ef6b1955053d1e1954702ee41c6903c448d0 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:15:35 +0900 Subject: [PATCH 10/29] feat: weight loading for clip based models --- vllm/model_executor/models/clip.py | 24 +++++++++++---- vllm/model_executor/models/phi3v.py | 45 +++++++++++++++++------------ 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index cf60ecec484f..6fcd3ca141a7 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -372,6 +372,14 @@ def device(self): return next(self.parameters()).device def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] params_dict = dict(self.named_parameters()) layer_count = len(self.vision_model.encoder.layers) @@ -384,8 +392,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): layer_idx = int(name.split(".")[3]) if layer_idx >= layer_count: continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 823c34b10187..265e8f726ff0 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -43,7 +43,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, input_processor_for_clip) from .interfaces import SupportsVision -from .utils import merge_vision_embeddings +from .utils import merge_vision_embeddings, filter_weights logger = init_logger(__name__) @@ -70,6 +70,23 @@ projection_dim=768) +def _init_img_processor(hf_config: PretrainedConfig): + clip_config = 
CLIP_VIT_LARGE_PATCH14_336_CONFIG + layer_idx = hf_config.img_processor.get('layer_idx', -2) + + # Initialize the CLIP only up to the required feature layer + if layer_idx < 0: + num_hidden_layers = clip_config.num_hidden_layers + \ + layer_idx + 1 + else: + num_hidden_layers = layer_idx + 1 + + img_processor = CLIPVisionModel( + clip_config, num_hidden_layers_override=num_hidden_layers) + + return img_processor + + class Phi3ImageEmbeddingBase(nn.Module): def __init__(self) -> None: @@ -107,18 +124,8 @@ def __init__(self, config: PretrainedConfig) -> None: hidden_size = config.n_embd if hasattr( config, 'n_embd') else config.hidden_size - clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG - self.layer_idx = config.img_processor.get('layer_idx', -2) + self.img_processor = _init_img_processor(config) - # Initialize the CLIP only up to the required feature layer - if self.layer_idx < 0: - num_hidden_layers = clip_config.num_hidden_layers + \ - self.layer_idx + 1 - else: - num_hidden_layers = self.layer_idx + 1 - - self.img_processor = CLIPVisionModel( - clip_config, num_hidden_layers_override=num_hidden_layers) image_dim_out = config.img_processor['image_dim_out'] self.num_img_tokens = config.img_processor['num_img_tokens'] @@ -572,19 +579,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - # post_layernorm is not needed in CLIPVisionModel - if "vision_model.post_layernorm" in name: - continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) for (param_name, weight_name, shard_id) in stacked_params_mapping: - # We only do sharding for language model - # and not vision model for now. - if "vision_embed_tokens" in name and self.vision_embed_tokens: - continue if weight_name not in name: continue + + if "vision_embed_tokens.img_processor" in name: + continue + param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -598,3 +602,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + vision_weights = filter_weights(weights, "vision_embed_tokens") + self.vision_embed_tokens.img_processor.load_weights(vision_weights) From 87043e4f092616efdc520016db0088dbab484f3a Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:16:48 +0900 Subject: [PATCH 11/29] feat: weight loading for siglip based models --- vllm/model_executor/models/paligemma.py | 48 +++++++++++-------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 9ba53b8b59a2..f5e31208e86f 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -278,34 +278,28 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - if "vision" in name: - if self.vision_tower is not None: - # We only do sharding for language model and - # not vision model for now. 
- use_default_weight_loading = True + for (param_name, shard_name, + shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break else: - for (param_name, shard_name, - shard_id) in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # lm_head is not used in vllm as it is tied with - # embed_token. To prevent errors, skip loading - # lm_head.weight. - if "lm_head.weight" in name: - continue - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - use_default_weight_loading = True + # lm_head is not used in vllm as it is tied with + # embed_token. To prevent errors, skip loading + # lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + use_default_weight_loading = True if use_default_weight_loading: param = params_dict[name] From 2c18f3a92058f51909848bfbbd203cd4c407d127 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:32:02 +0900 Subject: [PATCH 12/29] feat: weight loading for blip based models --- vllm/model_executor/models/blip2.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index e00e6c080695..5c0ebb60de94 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -646,22 +646,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - if "vision" in name: - if self.vision_model is not None: - # We only do sharding for language model and - # not vision model for now. 
- use_default_weight_loading = True + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break else: - for (param_name, weight_name, - shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - param = params_dict[name.replace(weight_name, param_name)] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - use_default_weight_loading = True + use_default_weight_loading = True if use_default_weight_loading: param = params_dict[name] weight_loader = getattr(param, "weight_loader", From c6015f589b5c7bd36ee73f5e83dd1b30a1602fe7 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:54:26 +0900 Subject: [PATCH 13/29] fix: bug in clip weight loading --- vllm/model_executor/models/clip.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 6fcd3ca141a7..652f8c0290f1 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -377,9 +377,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] + ] params_dict = dict(self.named_parameters()) layer_count = len(self.vision_model.encoder.layers) @@ -392,8 +390,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): layer_idx = int(name.split(".")[3]) if layer_idx >= layer_count: continue - + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) From be0e190e060df5819ea0c2f343e8f8ef32b92caf Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 20:08:05 +0900 Subject: [PATCH 14/29] fix: bug in clip attention --- vllm/model_executor/models/clip.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 652f8c0290f1..153cd34ae3e8 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -150,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings - +# TODO(ChristopherCho): Clip attention is not fully tested yet. 
class CLIPAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -198,14 +198,12 @@ def forward( ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, embed_dim = hidden_states.size() + bsz, tgt_len, _ = hidden_states.size() # get query proj qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states * self.scale - query_states = query_states.view(bsz, tgt_len, self.num_heads_per_partition, self.head_dim) From 01214454a73b2763d348a21b9b6d0dee2a481669 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 20:08:26 +0900 Subject: [PATCH 15/29] fix: bug in blip attention --- vllm/model_executor/models/blip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index cffa9a8a1071..fd767fa97620 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -171,7 +171,7 @@ def __init__( f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") self.scale = self.head_dim**-0.5 - self.dropout = nn.Dropout(config.attention_dropout) + self.dropout = config.attention_dropout self.qkv = QKVParallelLinear( self.embed_dim, @@ -217,7 +217,7 @@ def forward( p=self.dropout, scale=self.scale) out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() - attn_output, _ = self.out_proj(out) + attn_output, _ = self.projection(out) return attn_output From 414040faa2a23ca5de38226c544397eb8243a1e5 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 20:08:43 +0900 Subject: [PATCH 16/29] fix: blip does not require sharding --- vllm/model_executor/models/blip2.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 5c0ebb60de94..b26d18570c7c 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -646,16 +646,23 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - for (param_name, weight_name, - shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - param = params_dict[name.replace(weight_name, param_name)] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break + if "vision" in name: + if self.vision_model is not None: + # We only do sharding for language model and + # not vision model for now. 
+ use_default_weight_loading = True else: - use_default_weight_loading = True + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: param = params_dict[name] weight_loader = getattr(param, "weight_loader", From f28aec30d1cb49537a949cd142aadf81e356230a Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 10:51:02 +0900 Subject: [PATCH 17/29] fix: style --- vllm/model_executor/models/clip.py | 4 +--- vllm/model_executor/models/paligemma.py | 3 +-- vllm/model_executor/models/phi3v.py | 6 +++--- vllm/model_executor/models/siglip.py | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 153cd34ae3e8..e892f0bd9996 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -150,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# TODO(ChristopherCho): Clip attention is not fully tested yet. + class CLIPAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -197,10 +197,8 @@ def forward( hidden_states: torch.Tensor, ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, _ = hidden_states.size() - # get query proj qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f5e31208e86f..a0126228b752 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -278,8 +278,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - for (param_name, shard_name, - shard_id) in stacked_params_mapping: + for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: continue name = name.replace(shard_name, param_name) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 265e8f726ff0..e2326cbfe2db 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -43,7 +43,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, input_processor_for_clip) from .interfaces import SupportsVision -from .utils import merge_vision_embeddings, filter_weights +from .utils import filter_weights, merge_vision_embeddings logger = init_logger(__name__) @@ -585,10 +585,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - + if "vision_embed_tokens.img_processor" in name: continue - + param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 0a01d386180d..ec8b081894f4 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -7,7 +7,7 @@ import torch from PIL import Image from torch import nn -from transformers import SiglipVisionConfig, SiglipVisionConfig 
+from transformers import SiglipVisionConfig from xformers import ops as xops from vllm.config import ModelConfig From 734fcb17e926adcf6bead4916eb25b495c7d50c2 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 11:23:23 +0900 Subject: [PATCH 18/29] fix: phi3v weight loading logic fixed --- vllm/model_executor/models/phi3v.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index e2326cbfe2db..3053ed055a32 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -43,7 +43,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, input_processor_for_clip) from .interfaces import SupportsVision -from .utils import filter_weights, merge_vision_embeddings +from .utils import merge_vision_embeddings logger = init_logger(__name__) @@ -575,6 +575,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".gate_proj", 0), (".gate_up_proj", ".up_proj", 1), ] + + # TODO(ChristopherCho): This is a temporary fix to load + # the vision weights with CLIPVisionModel.load_weights() + vision_weights = [] params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -594,6 +598,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id) break else: + if "vision_embed_tokens.img_processor" in name: + vision_weights.append((name, loaded_weight)) + continue # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue @@ -603,5 +610,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) - vision_weights = filter_weights(weights, "vision_embed_tokens") + vision_weights = [(n.replace("vision_embed_tokens.img_processor.", + ""), w) for n, w in vision_weights] self.vision_embed_tokens.img_processor.load_weights(vision_weights) From f1329c9e8e1f53d3424107ccb91291737781e7e3 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 13:33:33 +0900 Subject: [PATCH 19/29] fix: make intern vit working --- vllm/model_executor/models/intern_vit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 0acc8b7baecb..2986ca31801c 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -126,7 +126,7 @@ def __init__( def forward(self, x): B, N, C = x.shape - qkv, _ = self.qkv_proj(x) + qkv, _ = self.qkv(x) q, k, v = qkv.chunk(3, dim=-1) q = q.view(B, N, self.num_heads_per_partition, self.head_dim) @@ -144,7 +144,6 @@ def forward(self, x): q, k, v, - p=self.dropout, scale=self.scale, ) x = x.view(B, N, -1) From c77666230a5a7d3eec2b5a167e5550c3144791db Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 13:51:36 +0900 Subject: [PATCH 20/29] fix: fix for tp input --- vllm/model_executor/models/blip.py | 2 +- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/siglip.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index fd767fa97620..f525a4b396f6 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -216,7 +216,7 @@ def forward( value_states, p=self.dropout, 
scale=self.scale) - out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + out = out.reshape(bsz, tgt_len, -1).contiguous() attn_output, _ = self.projection(out) return attn_output diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index e892f0bd9996..9c210b475c00 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -217,7 +217,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + out = out.reshape(bsz, tgt_len, -1).contiguous() attn_output, _ = self.out_proj(out) return attn_output diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index ec8b081894f4..78db14a73909 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -268,7 +268,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(batch_size, q_len, self.embed_dim).contiguous() + out = out.reshape(batch_size, q_len, -1).contiguous() attn_output, _ = self.out_proj(out) return attn_output From 5ad0d22cb7bed51fd148cdca0d6d64d3967734dd Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Thu, 8 Aug 2024 10:18:02 +0900 Subject: [PATCH 21/29] fix: minor refactoring --- vllm/model_executor/models/blip2.py | 4 +--- vllm/model_executor/models/intern_vit.py | 2 +- vllm/model_executor/models/phi3v.py | 20 ++++++++++++-------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index b26d18570c7c..d9a40da600e7 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -648,8 +648,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): use_default_weight_loading = False if "vision" in name: if self.vision_model is not None: - # We only do sharding for language model and - # not vision model for now. + # BlipVisionModel does not need sharding use_default_weight_loading = True else: for (param_name, weight_name, @@ -662,7 +661,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): break else: use_default_weight_loading = True - if use_default_weight_loading: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 2986ca31801c..ad5919150cad 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -187,7 +187,7 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.norm_type = config.norm_type - self.attn = InternAttention(config) + self.attn = InternAttention(config, quant_config=quant_config) self.mlp = InternMLP(config, quant_config=quant_config) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 3053ed055a32..4e462a0e1c3b 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -583,6 +583,12 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # Skip loading the img_processor weights since they are + # loaded separately. 
+ if "vision_embed_tokens.img_processor" in name: + vision_weights.append((name, loaded_weight)) + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) @@ -590,17 +596,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if weight_name not in name: continue - if "vision_embed_tokens.img_processor" in name: - continue - param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: - if "vision_embed_tokens.img_processor" in name: - vision_weights.append((name, loaded_weight)) - continue # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue @@ -610,6 +610,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) - vision_weights = [(n.replace("vision_embed_tokens.img_processor.", - ""), w) for n, w in vision_weights] + # We use regex to extract the sub-module name + # from "model.vision_embed_tokens.img_processor.*" + vision_weights = [ + (re.search(r"vision_embed_tokens\.img_processor\.(.*)", + n).group(1), w) for n, w in vision_weights + ] self.vision_embed_tokens.img_processor.load_weights(vision_weights) From 70580a6134f28867c073062c69e218d367604398 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 27 Aug 2024 00:00:13 -0700 Subject: [PATCH 22/29] format --- vllm/model_executor/models/phi3v.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a230e0c953b3..045bbdb5d8c5 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -87,6 +87,7 @@ def _init_img_processor(hf_config: PretrainedConfig): return img_processor + class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: Union[torch.Tensor, List[torch.Tensor]] From 05932067013022175650c9972ecf73d461b67096 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 27 Aug 2024 23:06:54 -0700 Subject: [PATCH 23/29] cleanup TODO --- vllm/model_executor/models/paligemma.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index ac360991b963..a4c9eb9c9930 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -145,7 +145,6 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config - # TODO(ywang96): Port over SiglipVisionModel & TP self.vision_tower = SiglipVisionModel(config.vision_config) self.multi_modal_projector = PaliGemmaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, From 31af67338dcaf48e675c327a079ebf803ba8124e Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Fri, 30 Aug 2024 13:43:43 +0900 Subject: [PATCH 24/29] doc: Todo for adding prefix in clip load weights --- vllm/model_executor/models/clip.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 68318cc10059..a83eb71df8a1 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -378,6 +378,8 @@ def forward(self, pixel_values: Optional[torch.Tensor] = None): def device(self): return next(self.parameters()).device + # (TODO) Add prefix argument for filtering out weights to be loaded + # ref: 
https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) From 3652cb9744ee8fa01c34876f9a9ebc9dbf6db92c Mon Sep 17 00:00:00 2001 From: Jungho Christopher Cho Date: Fri, 30 Aug 2024 13:45:20 +0900 Subject: [PATCH 25/29] Fix: use view rather than reshape and contiguous Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/model_executor/models/blip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 5c1ea1da264a..e4f17b6d8c69 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -220,7 +220,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(bsz, tgt_len, -1).contiguous() + out = out.view(bsz, tgt_len, -1) attn_output, _ = self.projection(out) return attn_output From 21a146d15d59597f9ffc7a75899ca09e224e75b3 Mon Sep 17 00:00:00 2001 From: Jungho Christopher Cho Date: Fri, 30 Aug 2024 13:45:32 +0900 Subject: [PATCH 26/29] Fix: use view rather than reshape and contiguous in clip Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/model_executor/models/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index ae0e59ee5653..239f75dfc95b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -228,7 +228,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(bsz, tgt_len, -1).contiguous() + out = out.view(bsz, tgt_len, -1) attn_output, _ = self.out_proj(out) return attn_output From 2d1f639fecf97874f7c55fe5f24c4e2738587065 Mon Sep 17 00:00:00 2001 From: Jungho Christopher Cho Date: Fri, 30 Aug 2024 13:45:41 +0900 Subject: [PATCH 27/29] Fix: use view rather than reshape and contiguous in siglip Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/model_executor/models/siglip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index e70929c02c22..d29315bd1c3a 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -279,7 +279,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(batch_size, q_len, -1).contiguous() + out = out.view(batch_size, q_len, -1) attn_output, _ = self.out_proj(out) return attn_output From 659adc5ab5629564540cd6b9fccbb6ab9fae13e3 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Fri, 30 Aug 2024 14:57:38 +0900 Subject: [PATCH 28/29] feat: option for disabling bias in blip --- vllm/model_executor/models/blip.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index e4f17b6d8c69..e6acf8cd5d5b 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -181,6 +181,7 @@ def __init__( self.embed_dim, self.head_dim, self.num_heads, + bias=config.qkv_bias, quant_config=quant_config, ) self.projection = RowParallelLinear( From ffb176b1a5ed7613f3e8194d82c84f6b0762d816 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 30 Aug 2024 02:06:51 -0700 Subject: [PATCH 29/29] patch internvl --- tests/models/test_intern_vit.py | 3 +- tests/models/test_internvl.py | 63 
++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/tests/models/test_intern_vit.py b/tests/models/test_intern_vit.py
index e980446ff357..816f846f69ba 100644
--- a/tests/models/test_intern_vit.py
+++ b/tests/models/test_intern_vit.py
@@ -6,8 +6,6 @@
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 
-from vllm.model_executor.models.intern_vit import InternVisionModel
-
 from ..conftest import _ImageAssets, cleanup
 
 pytestmark = pytest.mark.vlm
@@ -49,6 +47,7 @@ def run_intern_vit_test(
         for pixel_value in pixel_values
     ]
 
+    from vllm.model_executor.models.intern_vit import InternVisionModel
     vllm_model = InternVisionModel(config)
     vllm_model.load_weights(hf_model.state_dict().items())
 
diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py
index 243bc857c88d..42732cebc656 100644
--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
@@ -6,9 +6,6 @@
 from PIL.Image import Image
 from transformers import AutoConfig
 
-from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END,
-                                                 IMG_START,
-                                                 image_to_pixel_values)
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
 
@@ -33,35 +30,6 @@
 ]
 
 
-class InternVLProcessor:
-    """A simple processor for InternVL2 HF model which misses a processor."""
-
-    def __init__(self, hf_runner: HfRunner):
-        self.num_image_token = hf_runner.model.num_image_token
-        self.tokenizer = hf_runner.tokenizer
-        self.dtype = hf_runner.model.dtype
-
-        self.config = AutoConfig.from_pretrained(hf_runner.model_name)
-        self.vision_config = self.config.vision_config
-        self.use_thumbnail = self.config.use_thumbnail
-        self.min_num = self.config.min_dynamic_patch
-        self.max_num = self.config.max_dynamic_patch
-        self.image_size = self.vision_config.image_size
-
-    def __call__(self, text: str, images: Image, **kwargs):
-        pixel_values = image_to_pixel_values(images, self.image_size,
-                                             self.min_num, self.max_num,
-                                             self.use_thumbnail).to(self.dtype)
-        num_patches_list = [pixel_values.shape[0]]
-        for num_patches in num_patches_list:
-            context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
-            image_tokens = IMG_START + context_tokens + IMG_END
-            text = text.replace('<image>', image_tokens, 1)
-        prompt = self.tokenizer(text, return_tensors="pt")
-        prompt.update({"pixel_values": pixel_values})
-        return prompt
-
-
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
 def generate(
     self,
@@ -127,6 +95,37 @@ def run_test(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
+    class InternVLProcessor:
+        """A simple processor for InternVL2 which misses a processor."""
+
+        def __init__(self, hf_runner: HfRunner):
+            self.num_image_token = hf_runner.model.num_image_token
+            self.tokenizer = hf_runner.tokenizer
+            self.dtype = hf_runner.model.dtype
+
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+        def __call__(self, text: str, images: Image, **kwargs):
+            from vllm.model_executor.models.internvl import (
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+            pixel_values = image_to_pixel_values(
+                images, self.image_size, self.min_num, self.max_num,
+                self.use_thumbnail).to(self.dtype)
+            num_patches_list = [pixel_values.shape[0]]
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token \
+                    * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                text = text.replace('<image>', image_tokens, 1)
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model,
                      max_model_len=4096,