From 0919bd134576bfa4abf84f011596be28b710cea0 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 11:02:30 +0900 Subject: [PATCH 01/29] feat: replace siglipattention with tp'ed one --- vllm/model_executor/models/siglip.py | 204 ++++----------------------- 1 file changed, 28 insertions(+), 176 deletions(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 5ba14f73394f..a29710110d4c 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -7,13 +7,11 @@ import torch from PIL import Image from torch import nn -from transformers import SiglipVisionConfig -from transformers.models.siglip.modeling_siglip import SiglipAttention -from vllm_flash_attn import flash_attn_func -from xformers.ops import memory_efficient_attention +from transformers import SiglipVisionConfig, SiglipVisionConfig +from xformers import ops as xops from vllm.config import ModelConfig -from vllm.distributed import get_tensor_model_parallel_world_size +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -210,9 +208,7 @@ def forward(self, return embeddings -# NOTE: Not used - kept for later when we TP the ViT -# TODO(ChristopherCho): Implement TP version of Attention -class SiglipTPAttention(nn.Module): +class SiglipAttention(nn.Module): def __init__( self, @@ -222,38 +218,30 @@ def __init__( super().__init__() self.config = config self.embed_dim = config.hidden_size - - tp_size = get_tensor_model_parallel_world_size() - self.total_num_heads = config.num_attention_heads - if self.total_num_heads % tp_size != 0: - raise ValueError( - f"Number of attention heads ({self.total_num_heads}) " - "must be divisible by the tensor model parallel size" - f" ({tp_size}).") - - self.num_heads = self.total_num_heads // tp_size - self.head_dim = self.embed_dim // self.total_num_heads - if self.head_dim * self.total_num_heads != self.embed_dim: + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: raise ValueError(f"embed_dim must be divisible by num_heads (got " "`embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") - self.qkv_size = self.num_heads * self.head_dim + self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout - self.qkv_proj = QKVParallelLinear( hidden_size=self.embed_dim, head_size=self.head_dim, - total_num_heads=self.total_num_heads, + total_num_heads=self.num_heads, quant_config=quant_config, ) + self.out_proj = RowParallelLinear( input_size=self.embed_dim, output_size=self.embed_dim, quant_config=quant_config, ) - self.attn_fn = self._basic_attention_forward + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def forward( self, @@ -263,163 +251,27 @@ def forward( batch_size, q_len, _ = hidden_states.size() qkv_states, _ = self.qkv_proj(hidden_states) - query_states, key_states, value_states = qkv_states.split( - [self.qkv_size] * 3, dim=-1) - - attn_output = self.attn_fn( - q=query_states, - k=key_states, - v=value_states, - batch_size=batch_size, - q_len=q_len, - ) - - attn_output, _ = self.out_proj(attn_output) - return attn_output - - def _basic_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, 
self.num_heads, - self.head_dim).transpose(1, 2) - k = k.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - v = v.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - k_v_seq_len = k.shape[-2] - attn_weights = torch.matmul(q, k.transpose(2, 3)) * self.scale - - if attn_weights.size() != ( - batch_size, - self.num_heads, - q_len, - k_v_seq_len, - ): - raise ValueError( - "Attention weights should be of size " - f"{(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}") - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, - dim=-1, - dtype=torch.float32).to(q.dtype) - attn_weights = nn.functional.dropout(attn_weights, - p=self.dropout, - training=self.training) - attn_output = torch.matmul(attn_weights, v) - - if attn_output.size() != ( - batch_size, - self.num_heads, - q_len, - self.head_dim, - ): - raise ValueError( - "`attn_output` should be of size " - f"{(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}") - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -# TODO(ChristopherCho): flash_attn_func is not working properly. -# It constantly throws a CUDA error. -class SiglipFlashAttention2(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.attn_fn = self._flash_attention_forward - - # Ported from https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/models/siglip/modeling_siglip.py#L449 - # and https://github.com/huggingface/transformers/blob/v4.43.3/src/transformers/modeling_flash_attention_utils.py#L133 - def _flash_attention_forward(self, q, k, v, batch_size, q_len, *args, - **kwargs): - """Implements the multihead softmax attention. - Arguments - --------- - q, k, v: The tensor containing the - query, key, and value. 
(B, S, H, D) - """ - - q = q.view(batch_size, q_len, self.num_heads, self.head_dim) - k = k.view(batch_size, q_len, self.num_heads, self.head_dim) - v = v.view(batch_size, q_len, self.num_heads, self.head_dim) - - attn_output = flash_attn_func( - q, - k, - v, - dropout_p=self.dropout, - causal=False, - ) - - attn_output = attn_output.reshape(batch_size, q_len, + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + + query_states = query_states.view(batch_size, q_len, self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(batch_size, q_len, self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(batch_size, q_len, self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.reshape(batch_size, q_len, self.embed_dim).contiguous() + attn_output, _ = self.out_proj(out) return attn_output -# NOTE: Not used - kept for later when we TP the ViT -class SiglipSdpaAttention(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.is_causal = False - self.attn_fn = self._sdpa_attention_forward - - def _sdpa_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - k = k.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - v = v.view(batch_size, q_len, self.num_heads, - self.head_dim).transpose(1, 2) - - attn_output = torch.nn.functional.scaled_dot_product_attention( - q, k, v, dropout_p=self.dropout, is_causal=False, scale=self.scale) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.view(batch_size, q_len, self.embed_dim) - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -class SiglipxFormersAttention(SiglipTPAttention): - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.attn_fn = self._xformers_attention_forward - - def _xformers_attention_forward(self, q, k, v, batch_size, q_len): - q = q.view(batch_size, q_len, self.num_heads, self.head_dim) - k = k.view(batch_size, q_len, self.num_heads, self.head_dim) - v = v.view(batch_size, q_len, self.num_heads, self.head_dim) - - attn_output = memory_efficient_attention(q, - k, - v, - p=0.0, - scale=self.scale) - attn_output = attn_output.reshape(batch_size, q_len, - self.embed_dim).contiguous() - - return attn_output - - -# NOTE: Not used - kept for later when we TP the ViT -SIGLIP_ATTENTION_CLASSES = { - "eager": SiglipTPAttention, - "flash_attention_2": SiglipFlashAttention2, - "sdpa": SiglipSdpaAttention, - "xformers": SiglipxFormersAttention, -} - - class SiglipMLP(nn.Module): def __init__( From 7cfc98c8b01063000b2b4c9431846853a3ae2b46 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:21:08 +0900 Subject: [PATCH 02/29] feat: tp blip attention --- vllm/model_executor/models/blip.py | 75 +++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 0b124d5e8a85..bc2529e4d1aa 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -6,17 +6,19 @@ import torch.nn as nn from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig -from transformers.models.blip.modeling_blip import BlipAttention from vllm.config import ModelConfig from 
vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from vllm.sequence import SequenceData +from vllm.distributed import divide, get_tensor_model_parallel_world_size +from xformers import ops as xops def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -150,6 +152,77 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings +class BlipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + quant_config=quant_config, + ) + self.projection = RowParallelLinear( + self.embed_dim, + self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + ): + """Input shape: Batch x Time x Channel""" + bsz, tgt_len, _ = hidden_states.size() + + qkv_states, _ = self.qkv(hidden_states) + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + query_states = query_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward( + query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale + ) + out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + attn_output, _ = self.out_proj(out) + + return attn_output + + class BlipMLP(nn.Module): def __init__(self, From 079c53fc17a58376ca46a53dac3c26d221fc77f1 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:32:00 +0900 Subject: [PATCH 03/29] feat: clip attention replaced --- vllm/model_executor/models/clip.py | 76 +++++++++++++++++++++++++++++- 1 file changed, 75 insertions(+), 1 deletion(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 805ade39389d..d7276e59af8b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -6,12 +6,13 @@ import torch.nn as nn from PIL import Image from transformers import CLIPVisionConfig -from transformers.models.clip.modeling_clip import CLIPAttention +from xformers import ops as xops from vllm.config import ModelConfig from vllm.inputs import LLMInputs from 
vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader @@ -149,6 +150,79 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings +class CLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + config: CLIPVisionConfig, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + + self.qkv_proj = QKVParallelLinear( + hidden_size=self.embed_dim, + head_size=self.head_dim, + total_num_heads=self.num_heads, + quant_config=quant_config, + ) + + self.out_proj = RowParallelLinear( + input_size=self.embed_dim, + output_size=self.embed_dim, + quant_config=quant_config, + ) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + ): + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + # get query proj + qkv_states, _ = self.qkv_proj(hidden_states) + query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) + + query_states = query_states * self.scale + + query_states = query_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + key_states = key_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + value_states = value_states.view(bsz, tgt_len, + self.num_heads_per_partition, + self.head_dim) + + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + attn_output, _ = self.out_proj(out) + + return attn_output + + class CLIPMLP(nn.Module): def __init__(self, From 9a9af502feeae8ab1fb07e092fb057d1a4018a83 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:32:09 +0900 Subject: [PATCH 04/29] fix: style --- vllm/model_executor/models/blip.py | 29 ++++++++++++++-------------- vllm/model_executor/models/siglip.py | 20 ++++++++++--------- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index bc2529e4d1aa..98fd391cdd00 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -6,8 +6,10 @@ import torch.nn as nn from PIL import Image from transformers import Blip2VisionConfig, BlipVisionConfig +from xformers import ops as xops from vllm.config import ModelConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -17,8 +19,6 @@ from vllm.multimodal.image import (cached_get_tokenizer, 
repeat_and_pad_image_tokens) from vllm.sequence import SequenceData -from vllm.distributed import divide, get_tensor_model_parallel_world_size -from xformers import ops as xops def get_blip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -157,7 +157,7 @@ class BlipAttention(nn.Module): def __init__( self, - config, + config: BlipVisionConfig, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() @@ -167,9 +167,9 @@ def __init__( self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." - ) + "embed_dim must be divisible by num_heads " + f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads}).") self.scale = self.head_dim**-0.5 self.dropout = nn.Dropout(config.attention_dropout) @@ -184,12 +184,13 @@ def __init__( self.embed_dim, quant_config=quant_config, ) - + self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): - return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + return tensor.view(bsz, seq_len, self.num_heads, + self.head_dim).transpose(1, 2).contiguous() def forward( self, @@ -210,13 +211,11 @@ def forward( self.num_heads_per_partition, self.head_dim) - out = xops.memory_efficient_attention_forward( - query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale - ) + out = xops.memory_efficient_attention_forward(query_states, + key_states, + value_states, + p=self.dropout, + scale=self.scale) out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() attn_output, _ = self.out_proj(out) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index a29710110d4c..7db7332b797e 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -253,20 +253,22 @@ def forward( qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states.view(batch_size, q_len, self.num_heads_per_partition, + query_states = query_states.view(batch_size, q_len, + self.num_heads_per_partition, self.head_dim) - key_states = key_states.view(batch_size, q_len, self.num_heads_per_partition, + key_states = key_states.view(batch_size, q_len, + self.num_heads_per_partition, self.head_dim) - value_states = value_states.view(batch_size, q_len, self.num_heads_per_partition, + value_states = value_states.view(batch_size, q_len, + self.num_heads_per_partition, self.head_dim) out = xops.memory_efficient_attention_forward(query_states, - key_states, - value_states, - p=self.dropout, - scale=self.scale) - out = out.reshape(batch_size, q_len, - self.embed_dim).contiguous() + key_states, + value_states, + p=self.dropout, + scale=self.scale) + out = out.reshape(batch_size, q_len, self.embed_dim).contiguous() attn_output, _ = self.out_proj(out) return attn_output From 8176c8ed23bc50d12a106eb22f184e9254de4279 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:36:22 +0900 Subject: [PATCH 05/29] fix: provide qunatization config --- vllm/model_executor/models/blip.py | 2 +- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/siglip.py | 3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git 
a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 98fd391cdd00..f7bf57d1c8ae 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -256,7 +256,7 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = BlipAttention(config) + self.self_attn = BlipAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = BlipMLP(config, quant_config=quant_config) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index d7276e59af8b..2e711a5a77d2 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -255,7 +255,7 @@ def __init__(self, quant_config: Optional[QuantizationConfig] = None): super().__init__() - self.self_attn = CLIPAttention(config) + self.self_attn = CLIPAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.mlp = CLIPMLP(config, quant_config=quant_config) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 7db7332b797e..afae822f9f43 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -316,8 +316,7 @@ def __init__( super().__init__() self.embed_dim = config.hidden_size - # TODO(ChristopherCho): use TP'ed Attention block - self.self_attn = SiglipAttention(config) + self.self_attn = SiglipAttention(config, quant_config=quant_config) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = SiglipMLP( From b3bdbef457d2948140dcfea006858c9d964237f6 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:37:58 +0900 Subject: [PATCH 06/29] fix: return value of attention --- vllm/model_executor/models/blip.py | 2 +- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/siglip.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index f7bf57d1c8ae..cffa9a8a1071 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -267,7 +267,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 2e711a5a77d2..b57b0e2220ff 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -267,7 +267,7 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index afae822f9f43..0a01d386180d 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -333,7 +333,7 @@ def forward( residual = hidden_states hidden_states = self.layer_norm1(hidden_states) - hidden_states, _ = self.self_attn(hidden_states=hidden_states) + hidden_states = 
self.self_attn(hidden_states=hidden_states) hidden_states = residual + hidden_states residual = hidden_states From 8927414339dd4b487124e051b9789bc38958ff72 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:46:55 +0900 Subject: [PATCH 07/29] fix: add tp config in clip attention --- vllm/model_executor/models/clip.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index b57b0e2220ff..1164086d174b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -19,6 +19,8 @@ from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from vllm.sequence import SequenceData +from vllm.distributed import divide, get_tensor_model_parallel_world_size + def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -183,6 +185,9 @@ def __init__( output_size=self.embed_dim, quant_config=quant_config, ) + + self.tp_size = get_tensor_model_parallel_world_size() + self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, From 22a0a8450b1f21831711d09ba24efdaa39961634 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:47:15 +0900 Subject: [PATCH 08/29] feat: tp attention in intern vit --- vllm/model_executor/models/intern_vit.py | 66 +++++++++++++++++------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 54c933e3e495..d37e470847fc 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -14,9 +14,12 @@ from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, + QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.distributed import divide, get_tensor_model_parallel_world_size +from xformers import ops as xops NORM2FN = { 'rms_norm': RMSNorm, @@ -81,7 +84,11 @@ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: class InternAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" - def __init__(self, config: PretrainedConfig): + def __init__( + self, + config: PretrainedConfig, + quant_config: Optional[QuantizationConfig] = None, + ): super().__init__() self.config = config self.embed_dim = config.hidden_size @@ -94,9 +101,13 @@ def __init__(self, config: PretrainedConfig): f' {self.num_heads}).') self.scale = self.head_dim**-0.5 - self.qkv = nn.Linear(self.embed_dim, - 3 * self.embed_dim, - bias=config.qkv_bias) + self.qkv = QKVParallelLinear( + self.embed_dim, + self.head_dim, + self.num_heads, + bias=config.qkv_bias, + quant_config=quant_config, + ) self.qk_normalization = config.qk_normalization @@ -104,25 +115,42 @@ def __init__(self, config: PretrainedConfig): self.q_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) self.k_norm = RMSNorm(self.embed_dim, eps=config.layer_norm_eps) - self.proj = nn.Linear(self.embed_dim, self.embed_dim) + self.proj = RowParallelLinear( + self.embed_dim, + self.embed_dim, + quant_config=quant_config, + ) + + self.tp_size = get_tensor_model_parallel_world_size() + 
self.num_heads_per_partition = divide(self.num_heads, self.tp_size) def forward(self, x): B, N, C = x.shape - qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, - C // self.num_heads).permute(2, 0, 3, 1, 4) - q, k, v = qkv.unbind(0) - + qkv, _ = self.qkv_proj(x) + q, k, v = qkv.chunk(3, dim=-1) + + q = q.view(B, N, self.num_heads_per_partition, self.head_dim) + k = k.view(B, N, self.num_heads_per_partition, self.head_dim) + v = v.view(B, N, self.num_heads_per_partition, self.head_dim) + if self.qk_normalization: - B_, H_, N_, D_ = q.shape - q = self.q_norm.forward_native(q.transpose(1, 2).flatten( - -2, -1)).view(B_, N_, H_, D_).transpose(1, 2) - k = self.k_norm.forward_native(k.transpose(1, 2).flatten( - -2, -1)).view(B_, N_, H_, D_).transpose(1, 2) - - x = F.scaled_dot_product_attention(q, k, v, scale=self.scale) - x = x.transpose(1, 2).reshape(B, N, C) - - x = self.proj(x) + B_, N_, H_, D_ = q.shape + q = self.q_norm.forward_native( + q.flatten(-2, -1) + ).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten( + -2, -1)).view(B_, N_, H_, D_) + + x = xops.memory_efficient_attention_forward( + q, + k, + v, + p=self.dropout, + scale=self.scale, + ) + x = x.view(B, N, -1) + + x, _ = self.proj(x) return x From f6063b126bbc66f21ed0037170507f19282d1ac5 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 13:48:56 +0900 Subject: [PATCH 09/29] fix: style fix --- vllm/model_executor/models/clip.py | 5 ++--- vllm/model_executor/models/intern_vit.py | 21 ++++++++++----------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 1164086d174b..cf60ecec484f 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -9,6 +9,7 @@ from xformers import ops as xops from vllm.config import ModelConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.inputs import LLMInputs from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -19,8 +20,6 @@ from vllm.multimodal.image import (cached_get_tokenizer, repeat_and_pad_image_tokens) from vllm.sequence import SequenceData -from vllm.distributed import divide, get_tensor_model_parallel_world_size - def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int: @@ -185,7 +184,7 @@ def __init__( output_size=self.embed_dim, quant_config=quant_config, ) - + self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index d37e470847fc..0acc8b7baecb 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -10,7 +10,9 @@ import torch.nn as nn import torch.nn.functional as F from transformers import PretrainedConfig +from xformers import ops as xops +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.activation import get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.linear import (ColumnParallelLinear, @@ -18,8 +20,6 @@ RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig from vllm.model_executor.model_loader.weight_utils import default_weight_loader -from vllm.distributed import divide, get_tensor_model_parallel_world_size -from xformers import ops as xops 
NORM2FN = { 'rms_norm': RMSNorm, @@ -85,7 +85,7 @@ class InternAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__( - self, + self, config: PretrainedConfig, quant_config: Optional[QuantizationConfig] = None, ): @@ -120,7 +120,7 @@ def __init__( self.embed_dim, quant_config=quant_config, ) - + self.tp_size = get_tensor_model_parallel_world_size() self.num_heads_per_partition = divide(self.num_heads, self.tp_size) @@ -128,18 +128,17 @@ def forward(self, x): B, N, C = x.shape qkv, _ = self.qkv_proj(x) q, k, v = qkv.chunk(3, dim=-1) - + q = q.view(B, N, self.num_heads_per_partition, self.head_dim) k = k.view(B, N, self.num_heads_per_partition, self.head_dim) v = v.view(B, N, self.num_heads_per_partition, self.head_dim) - + if self.qk_normalization: B_, N_, H_, D_ = q.shape - q = self.q_norm.forward_native( - q.flatten(-2, -1) - ).view(B_, N_, H_, D_) - k = self.k_norm.forward_native(k.flatten( - -2, -1)).view(B_, N_, H_, D_) + q = self.q_norm.forward_native(q.flatten(-2, + -1)).view(B_, N_, H_, D_) + k = self.k_norm.forward_native(k.flatten(-2, + -1)).view(B_, N_, H_, D_) x = xops.memory_efficient_attention_forward( q, From 8e54ef6b1955053d1e1954702ee41c6903c448d0 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:15:35 +0900 Subject: [PATCH 10/29] feat: weight loading for clip based models --- vllm/model_executor/models/clip.py | 24 +++++++++++---- vllm/model_executor/models/phi3v.py | 45 +++++++++++++++++------------ 2 files changed, 45 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index cf60ecec484f..6fcd3ca141a7 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -372,6 +372,14 @@ def device(self): return next(self.parameters()).device def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] params_dict = dict(self.named_parameters()) layer_count = len(self.vision_model.encoder.layers) @@ -384,8 +392,14 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): layer_idx = int(name.split(".")[3]) if layer_idx >= layer_count: continue - - param = params_dict[name] - weight_loader = getattr(param, "weight_loader", - default_weight_loader) - weight_loader(param, loaded_weight) + + for (param_name, weight_name, shard_id) in stacked_params_mapping: + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + weight_loader(param, loaded_weight) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 823c34b10187..265e8f726ff0 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -43,7 +43,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, input_processor_for_clip) from .interfaces import SupportsVision -from .utils import merge_vision_embeddings +from .utils import merge_vision_embeddings, filter_weights logger = init_logger(__name__) @@ -70,6 +70,23 @@ projection_dim=768) +def _init_img_processor(hf_config: PretrainedConfig): + clip_config = 
CLIP_VIT_LARGE_PATCH14_336_CONFIG + layer_idx = hf_config.img_processor.get('layer_idx', -2) + + # Initialize the CLIP only up to the required feature layer + if layer_idx < 0: + num_hidden_layers = clip_config.num_hidden_layers + \ + layer_idx + 1 + else: + num_hidden_layers = layer_idx + 1 + + img_processor = CLIPVisionModel( + clip_config, num_hidden_layers_override=num_hidden_layers) + + return img_processor + + class Phi3ImageEmbeddingBase(nn.Module): def __init__(self) -> None: @@ -107,18 +124,8 @@ def __init__(self, config: PretrainedConfig) -> None: hidden_size = config.n_embd if hasattr( config, 'n_embd') else config.hidden_size - clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG - self.layer_idx = config.img_processor.get('layer_idx', -2) + self.img_processor = _init_img_processor(config) - # Initialize the CLIP only up to the required feature layer - if self.layer_idx < 0: - num_hidden_layers = clip_config.num_hidden_layers + \ - self.layer_idx + 1 - else: - num_hidden_layers = self.layer_idx + 1 - - self.img_processor = CLIPVisionModel( - clip_config, num_hidden_layers_override=num_hidden_layers) image_dim_out = config.img_processor['image_dim_out'] self.num_img_tokens = config.img_processor['num_img_tokens'] @@ -572,19 +579,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - # post_layernorm is not needed in CLIPVisionModel - if "vision_model.post_layernorm" in name: - continue for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) for (param_name, weight_name, shard_id) in stacked_params_mapping: - # We only do sharding for language model - # and not vision model for now. - if "vision_embed_tokens" in name and self.vision_embed_tokens: - continue if weight_name not in name: continue + + if "vision_embed_tokens.img_processor" in name: + continue + param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -598,3 +602,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader = getattr(param, "weight_loader", default_weight_loader) weight_loader(param, loaded_weight) + + vision_weights = filter_weights(weights, "vision_embed_tokens") + self.vision_embed_tokens.img_processor.load_weights(vision_weights) From 87043e4f092616efdc520016db0088dbab484f3a Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:16:48 +0900 Subject: [PATCH 11/29] feat: weight loading for siglip based models --- vllm/model_executor/models/paligemma.py | 48 +++++++++++-------------- 1 file changed, 21 insertions(+), 27 deletions(-) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index 9ba53b8b59a2..f5e31208e86f 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -278,34 +278,28 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - if "vision" in name: - if self.vision_tower is not None: - # We only do sharding for language model and - # not vision model for now. 
- use_default_weight_loading = True + for (param_name, shard_name, + shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break else: - for (param_name, shard_name, - shard_id) in stacked_params_mapping: - if shard_name not in name: - continue - name = name.replace(shard_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - # lm_head is not used in vllm as it is tied with - # embed_token. To prevent errors, skip loading - # lm_head.weight. - if "lm_head.weight" in name: - continue - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - use_default_weight_loading = True + # lm_head is not used in vllm as it is tied with + # embed_token. To prevent errors, skip loading + # lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + use_default_weight_loading = True if use_default_weight_loading: param = params_dict[name] From 2c18f3a92058f51909848bfbbd203cd4c407d127 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:32:02 +0900 Subject: [PATCH 12/29] feat: weight loading for blip based models --- vllm/model_executor/models/blip2.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index e00e6c080695..5c0ebb60de94 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -646,22 +646,16 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - if "vision" in name: - if self.vision_model is not None: - # We only do sharding for language model and - # not vision model for now. 
- use_default_weight_loading = True + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break else: - for (param_name, weight_name, - shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - param = params_dict[name.replace(weight_name, param_name)] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break - else: - use_default_weight_loading = True + use_default_weight_loading = True if use_default_weight_loading: param = params_dict[name] weight_loader = getattr(param, "weight_loader", From c6015f589b5c7bd36ee73f5e83dd1b30a1602fe7 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 19:54:26 +0900 Subject: [PATCH 13/29] fix: bug in clip weight loading --- vllm/model_executor/models/clip.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 6fcd3ca141a7..652f8c0290f1 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -377,9 +377,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): ("qkv_proj", "q_proj", "q"), ("qkv_proj", "k_proj", "k"), ("qkv_proj", "v_proj", "v"), - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ] + ] params_dict = dict(self.named_parameters()) layer_count = len(self.vision_model.encoder.layers) @@ -392,8 +390,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): layer_idx = int(name.split(".")[3]) if layer_idx >= layer_count: continue - + for (param_name, weight_name, shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) From be0e190e060df5819ea0c2f343e8f8ef32b92caf Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 20:08:05 +0900 Subject: [PATCH 14/29] fix: bug in clip attention --- vllm/model_executor/models/clip.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 652f8c0290f1..153cd34ae3e8 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -150,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings - +# TODO(ChristopherCho): Clip attention is not fully tested yet. 
class CLIPAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -198,14 +198,12 @@ def forward( ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, embed_dim = hidden_states.size() + bsz, tgt_len, _ = hidden_states.size() # get query proj qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) - query_states = query_states * self.scale - query_states = query_states.view(bsz, tgt_len, self.num_heads_per_partition, self.head_dim) From 01214454a73b2763d348a21b9b6d0dee2a481669 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 20:08:26 +0900 Subject: [PATCH 15/29] fix: bug in blip attention --- vllm/model_executor/models/blip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index cffa9a8a1071..fd767fa97620 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -171,7 +171,7 @@ def __init__( f"(got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads}).") self.scale = self.head_dim**-0.5 - self.dropout = nn.Dropout(config.attention_dropout) + self.dropout = config.attention_dropout self.qkv = QKVParallelLinear( self.embed_dim, @@ -217,7 +217,7 @@ def forward( p=self.dropout, scale=self.scale) out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() - attn_output, _ = self.out_proj(out) + attn_output, _ = self.projection(out) return attn_output From 414040faa2a23ca5de38226c544397eb8243a1e5 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Tue, 6 Aug 2024 20:08:43 +0900 Subject: [PATCH 16/29] fix: blip does not require sharding --- vllm/model_executor/models/blip2.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 5c0ebb60de94..b26d18570c7c 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -646,16 +646,23 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - for (param_name, weight_name, - shard_id) in stacked_params_mapping: - if weight_name not in name: - continue - param = params_dict[name.replace(weight_name, param_name)] - weight_loader = param.weight_loader - weight_loader(param, loaded_weight, shard_id) - break + if "vision" in name: + if self.vision_model is not None: + # We only do sharding for language model and + # not vision model for now. 
+ use_default_weight_loading = True else: - use_default_weight_loading = True + for (param_name, weight_name, + shard_id) in stacked_params_mapping: + if weight_name not in name: + continue + param = params_dict[name.replace(weight_name, param_name)] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + use_default_weight_loading = True + if use_default_weight_loading: param = params_dict[name] weight_loader = getattr(param, "weight_loader", From f28aec30d1cb49537a949cd142aadf81e356230a Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 10:51:02 +0900 Subject: [PATCH 17/29] fix: style --- vllm/model_executor/models/clip.py | 4 +--- vllm/model_executor/models/paligemma.py | 3 +-- vllm/model_executor/models/phi3v.py | 6 +++--- vllm/model_executor/models/siglip.py | 2 +- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 153cd34ae3e8..e892f0bd9996 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -150,7 +150,7 @@ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: return embeddings -# TODO(ChristopherCho): Clip attention is not fully tested yet. + class CLIPAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -197,10 +197,8 @@ def forward( hidden_states: torch.Tensor, ): """Input shape: Batch x Time x Channel""" - bsz, tgt_len, _ = hidden_states.size() - # get query proj qkv_states, _ = self.qkv_proj(hidden_states) query_states, key_states, value_states = qkv_states.chunk(3, dim=-1) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index f5e31208e86f..a0126228b752 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -278,8 +278,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if key_to_modify in name: name = name.replace(key_to_modify, new_key) use_default_weight_loading = False - for (param_name, shard_name, - shard_id) in stacked_params_mapping: + for (param_name, shard_name, shard_id) in stacked_params_mapping: if shard_name not in name: continue name = name.replace(shard_name, param_name) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 265e8f726ff0..e2326cbfe2db 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -43,7 +43,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, input_processor_for_clip) from .interfaces import SupportsVision -from .utils import merge_vision_embeddings, filter_weights +from .utils import filter_weights, merge_vision_embeddings logger = init_logger(__name__) @@ -585,10 +585,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for (param_name, weight_name, shard_id) in stacked_params_mapping: if weight_name not in name: continue - + if "vision_embed_tokens.img_processor" in name: continue - + param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 0a01d386180d..ec8b081894f4 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -7,7 +7,7 @@ import torch from PIL import Image from torch import nn -from transformers import SiglipVisionConfig, SiglipVisionConfig 
+from transformers import SiglipVisionConfig from xformers import ops as xops from vllm.config import ModelConfig From 734fcb17e926adcf6bead4916eb25b495c7d50c2 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 11:23:23 +0900 Subject: [PATCH 18/29] fix: phi3v weight loading logic fixed --- vllm/model_executor/models/phi3v.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index e2326cbfe2db..3053ed055a32 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -43,7 +43,7 @@ from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, input_processor_for_clip) from .interfaces import SupportsVision -from .utils import filter_weights, merge_vision_embeddings +from .utils import merge_vision_embeddings logger = init_logger(__name__) @@ -575,6 +575,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): (".gate_up_proj", ".gate_proj", 0), (".gate_up_proj", ".up_proj", 1), ] + + # TODO(ChristopherCho): This is a temporary fix to load + # the vision weights with CLIPVisionModel.load_weights() + vision_weights = [] params_dict = dict(self.named_parameters()) for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: @@ -594,6 +598,9 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): weight_loader(param, loaded_weight, shard_id) break else: + if "vision_embed_tokens.img_processor" in name: + vision_weights.append((name, loaded_weight)) + continue # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue @@ -603,5 +610,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) - vision_weights = filter_weights(weights, "vision_embed_tokens") + vision_weights = [(n.replace("vision_embed_tokens.img_processor.", + ""), w) for n, w in vision_weights] self.vision_embed_tokens.img_processor.load_weights(vision_weights) From f1329c9e8e1f53d3424107ccb91291737781e7e3 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 13:33:33 +0900 Subject: [PATCH 19/29] fix: make intern vit working --- vllm/model_executor/models/intern_vit.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 0acc8b7baecb..2986ca31801c 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -126,7 +126,7 @@ def __init__( def forward(self, x): B, N, C = x.shape - qkv, _ = self.qkv_proj(x) + qkv, _ = self.qkv(x) q, k, v = qkv.chunk(3, dim=-1) q = q.view(B, N, self.num_heads_per_partition, self.head_dim) @@ -144,7 +144,6 @@ def forward(self, x): q, k, v, - p=self.dropout, scale=self.scale, ) x = x.view(B, N, -1) From c77666230a5a7d3eec2b5a167e5550c3144791db Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Wed, 7 Aug 2024 13:51:36 +0900 Subject: [PATCH 20/29] fix: fix for tp input --- vllm/model_executor/models/blip.py | 2 +- vllm/model_executor/models/clip.py | 2 +- vllm/model_executor/models/siglip.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index fd767fa97620..f525a4b396f6 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -216,7 +216,7 @@ def forward( value_states, p=self.dropout, 
scale=self.scale) - out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + out = out.reshape(bsz, tgt_len, -1).contiguous() attn_output, _ = self.projection(out) return attn_output diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index e892f0bd9996..9c210b475c00 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -217,7 +217,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(bsz, tgt_len, self.embed_dim).contiguous() + out = out.reshape(bsz, tgt_len, -1).contiguous() attn_output, _ = self.out_proj(out) return attn_output diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index ec8b081894f4..78db14a73909 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -268,7 +268,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(batch_size, q_len, self.embed_dim).contiguous() + out = out.reshape(batch_size, q_len, -1).contiguous() attn_output, _ = self.out_proj(out) return attn_output From 5ad0d22cb7bed51fd148cdca0d6d64d3967734dd Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Thu, 8 Aug 2024 10:18:02 +0900 Subject: [PATCH 21/29] fix: minor refactoring --- vllm/model_executor/models/blip2.py | 4 +--- vllm/model_executor/models/intern_vit.py | 2 +- vllm/model_executor/models/phi3v.py | 20 ++++++++++++-------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index b26d18570c7c..d9a40da600e7 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -648,8 +648,7 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): use_default_weight_loading = False if "vision" in name: if self.vision_model is not None: - # We only do sharding for language model and - # not vision model for now. + # BlipVisionModel does not need sharding use_default_weight_loading = True else: for (param_name, weight_name, @@ -662,7 +661,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): break else: use_default_weight_loading = True - if use_default_weight_loading: param = params_dict[name] weight_loader = getattr(param, "weight_loader", diff --git a/vllm/model_executor/models/intern_vit.py b/vllm/model_executor/models/intern_vit.py index 2986ca31801c..ad5919150cad 100644 --- a/vllm/model_executor/models/intern_vit.py +++ b/vllm/model_executor/models/intern_vit.py @@ -187,7 +187,7 @@ def __init__(self, self.intermediate_size = config.intermediate_size self.norm_type = config.norm_type - self.attn = InternAttention(config) + self.attn = InternAttention(config, quant_config=quant_config) self.mlp = InternMLP(config, quant_config=quant_config) self.norm1 = NORM2FN[self.norm_type](self.embed_dim, eps=config.layer_norm_eps) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 3053ed055a32..4e462a0e1c3b 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -583,6 +583,12 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue + # Skip loading the img_processor weights since they are + # loaded separately. 
+ if "vision_embed_tokens.img_processor" in name: + vision_weights.append((name, loaded_weight)) + continue + for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items(): if key_to_modify in name: name = name.replace(key_to_modify, new_key) @@ -590,17 +596,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): if weight_name not in name: continue - if "vision_embed_tokens.img_processor" in name: - continue - param = params_dict[name.replace(weight_name, param_name)] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) break else: - if "vision_embed_tokens.img_processor" in name: - vision_weights.append((name, loaded_weight)) - continue # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue @@ -610,6 +610,10 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): default_weight_loader) weight_loader(param, loaded_weight) - vision_weights = [(n.replace("vision_embed_tokens.img_processor.", - ""), w) for n, w in vision_weights] + # We use regex to extract the sub-module name + # from "model.vision_embed_tokens.img_processor.*" + vision_weights = [ + (re.search(r"vision_embed_tokens\.img_processor\.(.*)", + n).group(1), w) for n, w in vision_weights + ] self.vision_embed_tokens.img_processor.load_weights(vision_weights) From 70580a6134f28867c073062c69e218d367604398 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 27 Aug 2024 00:00:13 -0700 Subject: [PATCH 22/29] format --- vllm/model_executor/models/phi3v.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index a230e0c953b3..045bbdb5d8c5 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -87,6 +87,7 @@ def _init_img_processor(hf_config: PretrainedConfig): return img_processor + class Phi3VImagePixelInputs(TypedDict): type: Literal["pixel_values"] data: Union[torch.Tensor, List[torch.Tensor]] From 05932067013022175650c9972ecf73d461b67096 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Tue, 27 Aug 2024 23:06:54 -0700 Subject: [PATCH 23/29] cleanup TODO --- vllm/model_executor/models/paligemma.py | 1 - 1 file changed, 1 deletion(-) diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py index ac360991b963..a4c9eb9c9930 100644 --- a/vllm/model_executor/models/paligemma.py +++ b/vllm/model_executor/models/paligemma.py @@ -145,7 +145,6 @@ def __init__(self, self.config = config self.multimodal_config = multimodal_config - # TODO(ywang96): Port over SiglipVisionModel & TP self.vision_tower = SiglipVisionModel(config.vision_config) self.multi_modal_projector = PaliGemmaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, From 31af67338dcaf48e675c327a079ebf803ba8124e Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Fri, 30 Aug 2024 13:43:43 +0900 Subject: [PATCH 24/29] doc: Todo for adding prefix in clip load weights --- vllm/model_executor/models/clip.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index 68318cc10059..a83eb71df8a1 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -378,6 +378,8 @@ def forward(self, pixel_values: Optional[torch.Tensor] = None): def device(self): return next(self.parameters()).device + # (TODO) Add prefix argument for filtering out weights to be loaded + # ref: 
https://github.com/vllm-project/vllm/pull/7186#discussion_r1734163986 def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): stacked_params_mapping = [ # (param_name, shard_name, shard_id) From 3652cb9744ee8fa01c34876f9a9ebc9dbf6db92c Mon Sep 17 00:00:00 2001 From: Jungho Christopher Cho Date: Fri, 30 Aug 2024 13:45:20 +0900 Subject: [PATCH 25/29] Fix: use view rather than reshape and contiguous Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/model_executor/models/blip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index 5c1ea1da264a..e4f17b6d8c69 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -220,7 +220,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(bsz, tgt_len, -1).contiguous() + out = out.view(bsz, tgt_len, -1) attn_output, _ = self.projection(out) return attn_output From 21a146d15d59597f9ffc7a75899ca09e224e75b3 Mon Sep 17 00:00:00 2001 From: Jungho Christopher Cho Date: Fri, 30 Aug 2024 13:45:32 +0900 Subject: [PATCH 26/29] Fix: use view rather than reshape and contiguous in clip Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/model_executor/models/clip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index ae0e59ee5653..239f75dfc95b 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -228,7 +228,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(bsz, tgt_len, -1).contiguous() + out = out.view(bsz, tgt_len, -1) attn_output, _ = self.out_proj(out) return attn_output From 2d1f639fecf97874f7c55fe5f24c4e2738587065 Mon Sep 17 00:00:00 2001 From: Jungho Christopher Cho Date: Fri, 30 Aug 2024 13:45:41 +0900 Subject: [PATCH 27/29] Fix: use view rather than reshape and contiguous in siglip Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com> --- vllm/model_executor/models/siglip.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index e70929c02c22..d29315bd1c3a 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -279,7 +279,7 @@ def forward( value_states, p=self.dropout, scale=self.scale) - out = out.reshape(batch_size, q_len, -1).contiguous() + out = out.view(batch_size, q_len, -1) attn_output, _ = self.out_proj(out) return attn_output From 659adc5ab5629564540cd6b9fccbb6ab9fae13e3 Mon Sep 17 00:00:00 2001 From: ChristopherCho Date: Fri, 30 Aug 2024 14:57:38 +0900 Subject: [PATCH 28/29] feat: option for disabling bias in blip --- vllm/model_executor/models/blip.py | 1 + 1 file changed, 1 insertion(+) diff --git a/vllm/model_executor/models/blip.py b/vllm/model_executor/models/blip.py index e4f17b6d8c69..e6acf8cd5d5b 100644 --- a/vllm/model_executor/models/blip.py +++ b/vllm/model_executor/models/blip.py @@ -181,6 +181,7 @@ def __init__( self.embed_dim, self.head_dim, self.num_heads, + bias=config.qkv_bias, quant_config=quant_config, ) self.projection = RowParallelLinear( From ffb176b1a5ed7613f3e8194d82c84f6b0762d816 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Fri, 30 Aug 2024 02:06:51 -0700 Subject: [PATCH 29/29] patch internvl --- tests/models/test_intern_vit.py | 3 +- tests/models/test_internvl.py | 63 
++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 34 deletions(-)

diff --git a/tests/models/test_intern_vit.py b/tests/models/test_intern_vit.py
index e980446ff357..816f846f69ba 100644
--- a/tests/models/test_intern_vit.py
+++ b/tests/models/test_intern_vit.py
@@ -6,8 +6,6 @@
 from huggingface_hub import snapshot_download
 from transformers import AutoConfig, AutoModel, CLIPImageProcessor
 
-from vllm.model_executor.models.intern_vit import InternVisionModel
-
 from ..conftest import _ImageAssets, cleanup
 
 pytestmark = pytest.mark.vlm
@@ -49,6 +47,7 @@ def run_intern_vit_test(
         for pixel_value in pixel_values
     ]
 
+    from vllm.model_executor.models.intern_vit import InternVisionModel
     vllm_model = InternVisionModel(config)
     vllm_model.load_weights(hf_model.state_dict().items())
 
diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py
index 243bc857c88d..42732cebc656 100644
--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
@@ -6,9 +6,6 @@
 from PIL.Image import Image
 from transformers import AutoConfig
 
-from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END,
-                                                 IMG_START,
-                                                 image_to_pixel_values)
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
 
@@ -33,35 +30,6 @@
 ]
 
 
-class InternVLProcessor:
-    """A simple processor for InternVL2 HF model which misses a processor."""
-
-    def __init__(self, hf_runner: HfRunner):
-        self.num_image_token = hf_runner.model.num_image_token
-        self.tokenizer = hf_runner.tokenizer
-        self.dtype = hf_runner.model.dtype
-
-        self.config = AutoConfig.from_pretrained(hf_runner.model_name)
-        self.vision_config = self.config.vision_config
-        self.use_thumbnail = self.config.use_thumbnail
-        self.min_num = self.config.min_dynamic_patch
-        self.max_num = self.config.max_dynamic_patch
-        self.image_size = self.vision_config.image_size
-
-    def __call__(self, text: str, images: Image, **kwargs):
-        pixel_values = image_to_pixel_values(images, self.image_size,
-                                             self.min_num, self.max_num,
-                                             self.use_thumbnail).to(self.dtype)
-        num_patches_list = [pixel_values.shape[0]]
-        for num_patches in num_patches_list:
-            context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
-            image_tokens = IMG_START + context_tokens + IMG_END
-            text = text.replace('<image>', image_tokens, 1)
-        prompt = self.tokenizer(text, return_tensors="pt")
-        prompt.update({"pixel_values": pixel_values})
-        return prompt
-
-
 # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B/blob/main/modeling_internvl_chat.py
 def generate(
     self,
@@ -127,6 +95,37 @@ def run_test(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
+    class InternVLProcessor:
+        """A simple processor for InternVL2 which misses a processor."""
+
+        def __init__(self, hf_runner: HfRunner):
+            self.num_image_token = hf_runner.model.num_image_token
+            self.tokenizer = hf_runner.tokenizer
+            self.dtype = hf_runner.model.dtype
+
+            self.config = AutoConfig.from_pretrained(hf_runner.model_name)
+            self.vision_config = self.config.vision_config
+            self.use_thumbnail = self.config.use_thumbnail
+            self.min_num = self.config.min_dynamic_patch
+            self.max_num = self.config.max_dynamic_patch
+            self.image_size = self.vision_config.image_size
+
+        def __call__(self, text: str, images: Image, **kwargs):
+            from vllm.model_executor.models.internvl import (
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+            pixel_values = image_to_pixel_values(
+                images, self.image_size, self.min_num, self.max_num,
+                self.use_thumbnail).to(self.dtype)
+            num_patches_list = [pixel_values.shape[0]]
+            for num_patches in num_patches_list:
+                context_tokens = IMG_CONTEXT * self.num_image_token \
+                    * num_patches
+                image_tokens = IMG_START + context_tokens + IMG_END
+                text = text.replace('<image>', image_tokens, 1)
+            prompt = self.tokenizer(text, return_tensors="pt")
+            prompt.update({"pixel_values": pixel_values})
+            return prompt
+
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model,
                      max_model_len=4096,