From 34bfa7935d4e67abe9309203e692fc46812c2a3d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 06:34:57 +0000
Subject: [PATCH 001/181] Introduce a higher level `INPUT_REGISTRY`

---
 vllm/inputs/__init__.py             | 13 +++++++++++
 vllm/{inputs.py => inputs/data.py}  |  0
 vllm/inputs/registry.py             | 36 +++++++++++++++++++++++++++++
 vllm/model_executor/models/llava.py |  8 +++----
 vllm/multimodal/registry.py         |  5 +---
 5 files changed, 54 insertions(+), 8 deletions(-)
 create mode 100644 vllm/inputs/__init__.py
 rename vllm/{inputs.py => inputs/data.py} (100%)
 create mode 100644 vllm/inputs/registry.py

diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
new file mode 100644
index 000000000000..92aa3e18493f
--- /dev/null
+++ b/vllm/inputs/__init__.py
@@ -0,0 +1,13 @@
+from .data import (
+    ParsedText, ParsedTokens, parse_and_batch_prompt,
+    TextPrompt, TokensPrompt, TextTokensPrompt,
+    PromptStrictInputs, PromptInputs, LLMInputs,
+)
+from .registry import INPUT_REGISTRY, InputRegistry
+
+__all__ = [
+    "ParsedText", "ParsedTokens", "parse_and_batch_prompt",
+    "TextPrompt", "TokensPrompt", "TextTokensPrompt",
+    "PromptStrictInputs", "PromptInputs", "LLMInputs",
+    "INPUT_REGISTRY", "InputRegistry"
+]
diff --git a/vllm/inputs.py b/vllm/inputs/data.py
similarity index 100%
rename from vllm/inputs.py
rename to vllm/inputs/data.py
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
new file mode 100644
index 000000000000..5a6b5e6914fa
--- /dev/null
+++ b/vllm/inputs/registry.py
@@ -0,0 +1,36 @@
+from typing import TYPE_CHECKING, Type, TypeVar
+
+from vllm.logger import init_logger
+from vllm.multimodal.base import MultiModalData
+from vllm.multimodal.registry import MULTIMODAL_REGISTRY, MultiModalRegistry
+
+if TYPE_CHECKING:
+    from torch import nn
+
+logger = init_logger(__name__)
+
+D = TypeVar("D", bound=MultiModalData)
+N = TypeVar("N", bound=Type["nn.Module"])
+
+
+class InputRegistry:
+    """
+    This registry is used by model runners to dispatch data processing
+    according to its modality and the target model.
+    """
+
+    def __init__(self,
+                 *,
+                 multimodal_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
+                 ) -> None:
+        self._multimodal_registry = multimodal_registry
+    
+    @property
+    def MULTIMODAL(self) -> MultiModalRegistry:
+        """Access the registry for processing multimodal inputs."""
+        return self._multimodal_registry
+
+
+
+INPUT_REGISTRY = InputRegistry()
+"""The global :class:`~InputRegistry` which is used by model runners."""
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 3332bcc57846..55f2d7e9b01d 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -8,6 +8,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.inputs import INPUT_REGISTRY
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -17,7 +18,6 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
@@ -84,9 +84,9 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
 
-@MULTIMODAL_REGISTRY.register_image_feature_input()
-@MULTIMODAL_REGISTRY.register_image_pixel_input()
-@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
+@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input()
+@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input()
+@INPUT_REGISTRY.MULTIMODAL.register_dummy_data(get_dummy_image_data)
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 4789ce5ce4cf..842b071d5642 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -5,12 +5,11 @@
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.logger import init_logger
 
-from .base import MultiModalData, MultiModalPlugin
+from .base import MultiModalData, MultiModalInputProcessor, MultiModalPlugin
 from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
                     ImagePixelPlugin)
 
 if TYPE_CHECKING:
-    import torch
     from torch import nn
 
     from vllm.sequence import SequenceData
@@ -20,8 +19,6 @@
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
-MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
-                                    Dict[str, "torch.Tensor"]]
 MultiModalDummyFactory = Callable[[int, ModelConfig, VisionLanguageConfig],
                                   Tuple["SequenceData", MultiModalData]]
 

From df2aa191fd4dafb90fc70f0f1061bdb6a7be4f63 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 09:34:25 +0000
Subject: [PATCH 002/181] Move dummy data generation to input registry

---
 .../dev/multimodal/multimodal_index.rst       |   4 +-
 vllm/config.py                                |   3 +
 vllm/engine/arg_utils.py                      |  65 +++---
 vllm/inputs/__init__.py                       |  24 +-
 vllm/inputs/registry.py                       | 122 ++++++++--
 vllm/model_executor/models/llava.py           |   5 +-
 vllm/multimodal/__init__.py                   |   5 +-
 vllm/multimodal/base.py                       |  12 +-
 vllm/multimodal/image.py                      | 211 +++++++++++++++---
 vllm/multimodal/registry.py                   |  64 +-----
 vllm/sequence.py                              |   4 +-
 vllm/worker/cpu_model_runner.py               |   4 +-
 vllm/worker/model_runner.py                   |  14 +-
 13 files changed, 364 insertions(+), 173 deletions(-)

diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index a25eceecc276..719d6e12ddfd 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -24,9 +24,7 @@ Module Contents
 Registry
 --------
 
-.. data:: vllm.multimodal.MULTIMODAL_REGISTRY
-
-    The global :class:`MultiModalRegistry` which is used by model runners.
+.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
 
 .. autoclass:: vllm.multimodal.MultiModalRegistry
     :members:
diff --git a/vllm/config.py b/vllm/config.py
index eee62d268383..77f0fb616b0c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -103,6 +103,7 @@ def __init__(
         disable_sliding_window: bool = False,
         skip_tokenizer_init: bool = False,
         served_model_name: Optional[Union[str, List[str]]] = None,
+        multimodal_config: Optional["VisionLanguageConfig"] = None,
     ) -> None:
         self.model = model
         self.tokenizer = tokenizer
@@ -137,6 +138,8 @@ def __init__(
             sliding_window_len=self.get_hf_config_sliding_window())
         self.served_model_name = get_served_model_name(model,
                                                        served_model_name)
+        self.multimodal_config = multimodal_config
+
         if not self.skip_tokenizer_init:
             self._verify_tokenizer_mode()
         self._verify_embedding_mode()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b315d4d2ece2..94241f603c8d 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -611,6 +611,37 @@ def create_engine_config(self, ) -> EngineConfig:
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
 
+        if self.image_input_type:
+            if (not self.image_token_id or not self.image_input_shape
+                    or not self.image_feature_size):
+                raise ValueError(
+                    'Specify `image_token_id`, `image_input_shape` and '
+                    '`image_feature_size` together with `image_input_type`.')
+
+            if self.image_processor is None:
+                self.image_processor = self.model
+            if self.disable_image_processor:
+                if self.image_processor != self.model:
+                    warnings.warn(
+                        "You've specified an image processor "
+                        f"({self.image_processor}) but also disabled "
+                        "it via `--disable-image-processor`.",
+                        stacklevel=2)
+
+                self.image_processor = None
+
+            vision_language_config = VisionLanguageConfig(
+                image_input_type=VisionLanguageConfig.
+                get_image_input_enum_type(self.image_input_type),
+                image_token_id=self.image_token_id,
+                image_input_shape=str_to_int_tuple(self.image_input_shape),
+                image_feature_size=self.image_feature_size,
+                image_processor=self.image_processor,
+                image_processor_revision=self.image_processor_revision,
+            )
+        else:
+            vision_language_config = None
+
         device_config = DeviceConfig(self.device)
         model_config = ModelConfig(
             self.model, self.tokenizer, self.tokenizer_mode,
@@ -620,7 +651,8 @@ def create_engine_config(self, ) -> EngineConfig:
             self.quantization_param_path, self.enforce_eager,
             self.max_context_len_to_capture, self.max_seq_len_to_capture,
             self.max_logprobs, self.disable_sliding_window,
-            self.skip_tokenizer_init, self.served_model_name)
+            self.skip_tokenizer_init, self.served_model_name,
+            vision_language_config)
         cache_config = CacheConfig(self.block_size,
                                    self.gpu_memory_utilization,
                                    self.swap_space, self.kv_cache_dtype,
@@ -691,37 +723,6 @@ def create_engine_config(self, ) -> EngineConfig:
             model_loader_extra_config=self.model_loader_extra_config,
         )
 
-        if self.image_input_type:
-            if (not self.image_token_id or not self.image_input_shape
-                    or not self.image_feature_size):
-                raise ValueError(
-                    'Specify `image_token_id`, `image_input_shape` and '
-                    '`image_feature_size` together with `image_input_type`.')
-
-            if self.image_processor is None:
-                self.image_processor = self.model
-            if self.disable_image_processor:
-                if self.image_processor != self.model:
-                    warnings.warn(
-                        "You've specified an image processor "
-                        f"({self.image_processor}) but also disabled "
-                        "it via `--disable-image-processor`.",
-                        stacklevel=2)
-
-                self.image_processor = None
-
-            vision_language_config = VisionLanguageConfig(
-                image_input_type=VisionLanguageConfig.
-                get_image_input_enum_type(self.image_input_type),
-                image_token_id=self.image_token_id,
-                image_input_shape=str_to_int_tuple(self.image_input_shape),
-                image_feature_size=self.image_feature_size,
-                image_processor=self.image_processor,
-                image_processor_revision=self.image_processor_revision,
-            )
-        else:
-            vision_language_config = None
-
         decoding_config = DecodingConfig(
             guided_decoding_backend=self.guided_decoding_backend)
 
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 92aa3e18493f..6288503bfe19 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,13 +1,17 @@
-from .data import (
-    ParsedText, ParsedTokens, parse_and_batch_prompt,
-    TextPrompt, TokensPrompt, TextTokensPrompt,
-    PromptStrictInputs, PromptInputs, LLMInputs,
-)
-from .registry import INPUT_REGISTRY, InputRegistry
+from vllm.multimodal import MULTIMODAL_REGISTRY
+
+from .data import (LLMInputs, ParsedText, ParsedTokens, PromptInputs,
+                   PromptStrictInputs, TextPrompt, TextTokensPrompt,
+                   TokensPrompt, parse_and_batch_prompt)
+from .registry import InputRegistry
+
+INPUT_REGISTRY = InputRegistry(multimodal_registry=MULTIMODAL_REGISTRY)
+"""The global :class:`~InputRegistry` which is used by model runners."""
+
+del MULTIMODAL_REGISTRY
 
 __all__ = [
-    "ParsedText", "ParsedTokens", "parse_and_batch_prompt",
-    "TextPrompt", "TokensPrompt", "TextTokensPrompt",
-    "PromptStrictInputs", "PromptInputs", "LLMInputs",
-    "INPUT_REGISTRY", "InputRegistry"
+    "ParsedText", "ParsedTokens", "parse_and_batch_prompt", "TextPrompt",
+    "TokensPrompt", "TextTokensPrompt", "PromptStrictInputs", "PromptInputs",
+    "LLMInputs", "INPUT_REGISTRY", "InputRegistry"
 ]
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 5a6b5e6914fa..3c2105c8e93d 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,16 +1,64 @@
-from typing import TYPE_CHECKING, Type, TypeVar
+from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
+                    TypeVar)
+
+from torch import nn
+from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
-from vllm.multimodal.base import MultiModalData
-from vllm.multimodal.registry import MULTIMODAL_REGISTRY, MultiModalRegistry
 
 if TYPE_CHECKING:
-    from torch import nn
+    from vllm.config import ModelConfig, VisionLanguageConfig
+    from vllm.multimodal import MultiModalData, MultiModalRegistry
+    from vllm.sequence import SequenceData
 
 logger = init_logger(__name__)
 
-D = TypeVar("D", bound=MultiModalData)
-N = TypeVar("N", bound=Type["nn.Module"])
+D = TypeVar("D", bound="MultiModalData")
+N = TypeVar("N", bound=Type[nn.Module])
+
+DummyDataFactory = Callable[[int, "ModelConfig"],
+                            Tuple["SequenceData", Optional["MultiModalData"]]]
+"""Create dummy data to be inputted into the model."""
+
+C = TypeVar("C", bound=PretrainedConfig)
+
+
+class DummyDataFactories:
+
+    @classmethod
+    def for_multimodal_hf(cls, hf_config_type: Type[C]):
+        """Decorates a dummy data factory that uses multimodal config as well
+        as a specific type of HuggingFace config.
+        
+        The returned function satisfies the interface of
+        :data:`DummyDataFactory`, with runtime checks being made to ensure
+        the validity of the inputs."""
+
+        def wrapper(
+            factory: Callable[[int, "VisionLanguageConfig", C],
+                              Tuple["SequenceData",
+                                    Optional["MultiModalData"]]],
+        ) -> DummyDataFactory:
+
+            def inner(
+                seq_len: int,
+                model_config: "ModelConfig",
+            ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
+                multimodal_config = model_config.multimodal_config
+                if multimodal_config is None:
+                    raise ValueError("No multimodal config found")
+
+                hf_config = model_config.hf_config
+                if not isinstance(hf_config, hf_config_type):
+                    raise TypeError("Invalid type of HuggingFace config. "
+                                    f"Expected type: {hf_config_type}, but "
+                                    f"received type: {type(hf_config)}")
+
+                return factory(seq_len, multimodal_config, hf_config)
+
+            return inner
+
+        return wrapper
 
 
 class InputRegistry:
@@ -19,18 +67,64 @@ class InputRegistry:
     according to its modality and the target model.
     """
 
-    def __init__(self,
-                 *,
-                 multimodal_registry: MultiModalRegistry = MULTIMODAL_REGISTRY,
-                 ) -> None:
+    def __init__(self, *, multimodal_registry: "MultiModalRegistry") -> None:
         self._multimodal_registry = multimodal_registry
-    
+
+        self._dummy_factories_by_model_type: Dict[Type[nn.Module],
+                                                  DummyDataFactory] = {}
+
     @property
-    def MULTIMODAL(self) -> MultiModalRegistry:
+    def MULTIMODAL(self) -> "MultiModalRegistry":
         """Access the registry for processing multimodal inputs."""
         return self._multimodal_registry
 
+    def _default_dummy_data_factory(
+        self,
+        seq_len: int,
+        model_config: "ModelConfig",
+    ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
+        """Create dummy data to be inputted into the model."""
+        # Avoid circular import
+        from vllm.sequence import SequenceData
+
+        dummy_seq_data = SequenceData([0] * seq_len)
+        dummy_multi_modal_data = None
+
+        return dummy_seq_data, dummy_multi_modal_data
+
+    def register_dummy_data(self, factory: DummyDataFactory):
+        """
+        Register a dummy data factory to a model class.
+
+        During memory profiling, the provided function is invoked to create
+        dummy data to be inputted into the model. The resulting memory usage
+        should be an upper bound of what the model would use at inference time.
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._dummy_factories_by_model_type:
+                logger.warning(
+                    "Model class %s already has dummy data "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._dummy_factories_by_model_type[model_cls] = factory
+
+            return model_cls
+
+        return wrapper
+
+    def dummy_data_for_profiling(
+        self,
+        seq_len: int,
+        model_config: "ModelConfig",
+    ):
+        """Create dummy data for memory profiling."""
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
 
+        model_cls, _ = get_model_architecture(model_config)
+        dummy_factory = self._dummy_factories_by_model_type \
+            .get(model_cls, self._default_dummy_data_factory)
 
-INPUT_REGISTRY = InputRegistry()
-"""The global :class:`~InputRegistry` which is used by model runners."""
+        return dummy_factory(seq_len, model_config)
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 55f2d7e9b01d..db82673e29c2 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -18,7 +18,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal.image import get_dummy_image_data
+from vllm.multimodal.image import DummyImageDataFactories
 from vllm.sequence import SamplerOutput
 
 from .vlm_base import VisionLanguageModelBase
@@ -86,7 +86,8 @@ class LlavaImageFeatureInputs(TypedDict):
 
 @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input()
 @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input()
-@INPUT_REGISTRY.MULTIMODAL.register_dummy_data(get_dummy_image_data)
+@INPUT_REGISTRY.register_dummy_data(
+    DummyImageDataFactories.for_model(LlavaConfig))
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 270012e7d1c3..c97586258c90 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,5 +1,8 @@
 from .base import MultiModalData, MultiModalPlugin
-from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry
+from .registry import MultiModalRegistry
+
+MULTIMODAL_REGISTRY = MultiModalRegistry()
+"""The global :class:`~MultiModalRegistry` which is used by model runners."""
 
 __all__ = [
     "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 847752449ba8..17c48865175f 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -50,13 +50,6 @@ class MultiModalPlugin(ABC, Generic[D]):
     (i.e., the modality of the data).
     """
 
-    @classmethod
-    def get_model_cls(cls, model_config: ModelConfig) -> Type["nn.Module"]:
-        # Avoid circular import
-        from vllm.model_executor.model_loader import get_model_architecture
-
-        return get_model_architecture(model_config)[0]
-
     def __init__(self) -> None:
         self._input_processors: Dict[Type["nn.Module"],
                                      MultiModalInputProcessor[D]] = {}
@@ -116,7 +109,10 @@ def process_input(
         for compatibility purposes and may be merged into ``model_config``
         in the near future.
         """
-        model_cls = self.get_model_cls(model_config)
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
 
         processor = self._input_processors.get(model_cls)
         if processor is None:
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index b964e9ee4262..498f4bcc1b0d 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,9 +1,14 @@
-from typing import Dict, Tuple, Type, Union
+from typing import Dict, Optional, Tuple, Type, Union
 
 import torch
 from PIL import Image
+from transformers import (CLIPVisionConfig, LlavaConfig, LlavaNextConfig,
+                          PretrainedConfig)
+from transformers.models.llava_next.modeling_llava_next import (
+    get_anyres_image_grid_shape)
 
 from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.inputs.registry import DummyDataFactories, DummyDataFactory
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import cached_get_image_processor
@@ -13,49 +18,185 @@
 logger = init_logger(__name__)
 
 
-def _get_dummy_seq_data(seq_len: int,
-                        vlm_config: VisionLanguageConfig) -> SequenceData:
+def _get_dummy_seq_data(
+    *,
+    seq_len: int,
+    image_token_id: int,
+    image_feature_size: int,
+) -> SequenceData:
     # NOTE: We assume that <image> token is repeated `image_feature_size` times
     # and then concatenated with the text prompt
     # TODO: Enable other ways of inserting the image into the prompt
 
-    token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size
-    token_ids += [0] * (seq_len - vlm_config.image_feature_size)
+    token_ids = [image_token_id] * image_feature_size
+    token_ids += [0] * (seq_len - image_feature_size)
 
     return SequenceData(token_ids)
 
 
-def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor:
-    if vlm_config.image_processor is None:
-        values_dtype = torch.float16
-    else:
-        values_dtype = torch.uint8
-
-    return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype)
-
+class DummyImageDataFactories:
+
+    @classmethod
+    def _get_clip_num_patches(
+        cls,
+        hf_config: CLIPVisionConfig,
+    ) -> int:
+        image_size = hf_config.image_size
+        patch_size = hf_config.patch_size
+
+        assert image_size % patch_size == 0
+        return image_size // patch_size
+
+    @classmethod
+    def _dummy_data_for_clip(
+        cls,
+        seq_len: int,
+        multimodal_config: VisionLanguageConfig,
+        hf_config: CLIPVisionConfig,
+        *,
+        image_token_id: int,
+        image_feature_size_override: Optional[int] = None,
+    ):
+        if image_feature_size_override is None:
+            num_patches = cls._get_clip_num_patches(hf_config)
+            image_feature_size = num_patches * num_patches
+        else:
+            image_feature_size = image_feature_size_override
+
+        seq_data = _get_dummy_seq_data(
+            seq_len=seq_len,
+            image_token_id=image_token_id,
+            image_feature_size=image_feature_size,
+        )
 
-def get_dummy_image_data(
-    seq_len: int,
-    model_config: ModelConfig,
-    vlm_config: VisionLanguageConfig,
-) -> Tuple[SequenceData, MultiModalData]:
-    """Standard dummy data factory for image data (to be used in
-    :meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`)."""
-    seq_data = _get_dummy_seq_data(seq_len, vlm_config)
-    values = _get_dummy_values(vlm_config)
-
-    config_input_type = vlm_config.image_input_type
-    ImageInputType = VisionLanguageConfig.ImageInputType
-
-    fake_mm_data: MultiModalData
-    if config_input_type == ImageInputType.PIXEL_VALUES:
-        fake_mm_data = ImagePixelData(values)
-    elif config_input_type == ImageInputType.IMAGE_FEATURES:
-        fake_mm_data = ImageFeatureData(values)
-    else:
-        raise NotImplementedError
-
-    return seq_data, fake_mm_data
+        image_input_type = multimodal_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+        multi_modal_data: MultiModalData
+        if image_input_type == ImageInputType.PIXEL_VALUES:
+            width = height = hf_config.image_size
+            if multimodal_config.image_processor is None:
+                values_dtype = torch.float16
+            else:
+                values_dtype = torch.uint8
+
+            values = torch.zeros((1, 3, width, height), dtype=values_dtype)
+            multi_modal_data = ImagePixelData(values)
+        elif image_input_type == ImageInputType.IMAGE_FEATURES:
+            depth = hf_config.hidden_size
+            values_dtype = torch.float16
+
+            values = torch.zeros((1, image_feature_size, depth),
+                                 dtype=values_dtype)
+            multi_modal_data = ImageFeatureData(values)
+
+        return seq_data, multi_modal_data
+
+    @classmethod
+    def _dummy_data_for_llava(
+        cls,
+        seq_len: int,
+        multimodal_config: VisionLanguageConfig,
+        hf_config: LlavaConfig,
+    ):
+        vision_config = hf_config.vision_config
+
+        if isinstance(vision_config, CLIPVisionConfig):
+            return cls._dummy_data_for_clip(
+                seq_len=seq_len,
+                multimodal_config=multimodal_config,
+                hf_config=vision_config,
+                image_token_id=hf_config.image_token_index,
+            )
+
+        msg = f"Unsupported vision config: {type(vision_config)}"
+        raise NotImplementedError(msg)
+
+    @classmethod
+    def _get_llava_next_num_unpadded_features(
+        cls,
+        height: int,
+        width: int,
+        npatches: int,
+        num_patch_height: int,
+        num_patch_width: int,
+    ) -> Tuple[int, int]:
+        # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
+        current_height = npatches * num_patch_height
+        current_width = npatches * num_patch_width
+
+        aspect_ratio: float = width / height
+        current_aspect_ratio: float = current_width / current_height
+        if aspect_ratio > current_aspect_ratio:
+            new_height = (height * current_width) // width
+            current_height = new_height
+        else:
+            new_width = (width * current_height) // height
+            current_width = new_width
+
+        unpadded_features = current_height * current_width
+        newline_features = current_height
+        return (unpadded_features, newline_features)
+
+    @classmethod
+    def _dummy_data_for_llava_next(
+        cls,
+        seq_len: int,
+        multimodal_config: VisionLanguageConfig,
+        hf_config: LlavaNextConfig,
+    ):
+        vision_config = hf_config.vision_config
+
+        if isinstance(vision_config, CLIPVisionConfig):
+            num_patches = cls._get_clip_num_patches(vision_config)
+            base_feature_size = num_patches * num_patches
+
+            # Results in the max possible feature size
+            dummy_height, dummy_width = 448, 448
+            num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+                image_size=(dummy_height, dummy_width),
+                grid_pinpoints=hf_config.image_grid_pinpoints,
+                patch_size=vision_config.image_size,
+            )
+
+            (
+                unpadded_feature_size,
+                newline_feature_size,
+            ) = cls._get_llava_next_num_unpadded_features(
+                dummy_height,
+                dummy_width,
+                num_patches,
+                num_patch_height,
+                num_patch_width,
+            )
+
+            image_feature_size = unpadded_feature_size + newline_feature_size \
+                + base_feature_size
+
+            return cls._dummy_data_for_clip(
+                seq_len=seq_len,
+                multimodal_config=multimodal_config,
+                hf_config=vision_config,
+                image_token_id=hf_config.image_token_index,
+                image_feature_size_override=image_feature_size,
+            )
+
+        msg = f"Unsupported vision config: {type(vision_config)}"
+        raise NotImplementedError(msg)
+
+    @classmethod
+    def for_model(
+        cls,
+        hf_config_type: Type[PretrainedConfig],
+    ) -> DummyDataFactory:
+        if hf_config_type == LlavaConfig:
+            return DummyDataFactories.for_multimodal_hf(LlavaConfig) \
+                (cls._dummy_data_for_llava)
+        if hf_config_type == LlavaNextConfig:
+            return DummyDataFactories.for_multimodal_hf(LlavaNextConfig) \
+                (cls._dummy_data_for_llava_next)
+
+        msg = f"Unsupported model config: {type(hf_config_type)}"
+        raise NotImplementedError(msg)
 
 
 class ImagePixelData(MultiModalData):
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 842b071d5642..9641efccc08a 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,6 +1,7 @@
 import functools
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence,
-                    Tuple, Type, TypeVar)
+from typing import Any, Optional, Sequence, Type, TypeVar
+
+from torch import nn
 
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.logger import init_logger
@@ -9,18 +10,10 @@
 from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
                     ImagePixelPlugin)
 
-if TYPE_CHECKING:
-    from torch import nn
-
-    from vllm.sequence import SequenceData
-
 logger = init_logger(__name__)
 
 D = TypeVar("D", bound=MultiModalData)
-N = TypeVar("N", bound=Type["nn.Module"])
-
-MultiModalDummyFactory = Callable[[int, ModelConfig, VisionLanguageConfig],
-                                  Tuple["SequenceData", MultiModalData]]
+N = TypeVar("N", bound=Type[nn.Module])
 
 
 class MultiModalRegistry:
@@ -31,13 +24,12 @@ class MultiModalRegistry:
 
     DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin())
 
-    def __init__(self,
-                 *,
-                 plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS
-                 ) -> None:
+    def __init__(
+        self,
+        *,
+        plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS,
+    ) -> None:
         self._plugins_by_data_type = {p.get_data_type(): p for p in plugins}
-        self._dummy_factories_by_model_type: Dict[Type["nn.Module"],
-                                                  MultiModalDummyFactory] = {}
 
     def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
         data_type = plugin.get_data_type()
@@ -59,40 +51,6 @@ def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]):
         msg = f"Unknown multi-modal data type: {data_type}"
         raise NotImplementedError(msg)
 
-    def register_dummy_data(self, factory: MultiModalDummyFactory):
-        """
-        Register a dummy data factory to a model class.
-
-        During memory profiling, the provided function is invoked to create
-        dummy data to be inputted into the model. The modality and shape of
-        the dummy data should be an upper bound of what the model would receive
-        at inference time.
-        """
-
-        def wrapper(model_cls: N) -> N:
-            if model_cls in self._dummy_factories_by_model_type:
-                logger.warning(
-                    "Model class %s already has dummy data "
-                    "registered to %s. It is overwritten by the new one.",
-                    model_cls, self)
-
-            self._dummy_factories_by_model_type[model_cls] = factory
-
-            return model_cls
-
-        return wrapper
-
-    def dummy_data_for_profiling(self, seq_len: int, model_config: ModelConfig,
-                                 vlm_config: VisionLanguageConfig):
-        """Create dummy data for memory profiling."""
-        model_cls = MultiModalPlugin.get_model_cls(model_config)
-        dummy_factory = self._dummy_factories_by_model_type.get(model_cls)
-        if dummy_factory is None:
-            msg = f"No dummy data defined for model class: {model_cls}"
-            raise NotImplementedError(msg)
-
-        return dummy_factory(seq_len, model_config, vlm_config)
-
     def register_input(
             self,
             data_type: Type[D],
@@ -147,7 +105,3 @@ def create_input_processor(self, model_config: ModelConfig,
         return functools.partial(self.process_input,
                                  model_config=model_config,
                                  vlm_config=vlm_config)
-
-
-MULTIMODAL_REGISTRY = MultiModalRegistry()
-"""The global :class:`~MultiModalRegistry` which is used by model runners."""
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 2f27bf33b166..4d549893f982 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -8,12 +8,12 @@
 import torch
 
 from vllm.block import LogicalTokenBlock
-from vllm.inputs import LLMInputs
 from vllm.lora.request import LoRARequest
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
 
 if TYPE_CHECKING:
+    from vllm.inputs import LLMInputs
     from vllm.multimodal import MultiModalData
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
@@ -221,7 +221,7 @@ class Sequence:
     def __init__(
         self,
         seq_id: int,
-        inputs: LLMInputs,
+        inputs: "LLMInputs",
         block_size: int,
         eos_token_id: Optional[int] = None,
         lora_request: Optional[LoRARequest] = None,
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index eaf43247d4fc..11630dd4612d 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -9,10 +9,10 @@
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict
+from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
 
@@ -67,7 +67,7 @@ def __init__(
 
         # Create processor for multi-modal data
         if self.vision_language_config is not None:
-            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
+            self.multi_modal_input_processor = INPUT_REGISTRY.MULTIMODAL \
                 .create_input_processor(
                     self.model_config,
                     self.vision_language_config,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 63ec22d79694..fb655b07748f 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -13,15 +13,15 @@
                          VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict
 from vllm.distributed.communication_op import graph_capture
+from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
-from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
+from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
                         is_pin_memory_available, make_tensor_with_pad)
 
@@ -125,7 +125,7 @@ def __init__(
 
         # Create processor for multi-modal data
         if self.vision_language_config is not None:
-            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
+            self.multi_modal_input_processor = INPUT_REGISTRY.MULTIMODAL \
                 .create_input_processor(
                     self.model_config,
                     self.vision_language_config,
@@ -806,12 +806,8 @@ def profile_run(self) -> None:
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
 
-            if vlm_config is None:
-                seq_data = SequenceData([0] * seq_len)
-                dummy_multi_modal_data = None
-            else:
-                seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \
-                    .dummy_data_for_profiling(seq_len, model_config, vlm_config)
+            seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
+                .dummy_data_for_profiling(seq_len, model_config)
 
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),

From c72d2b34f8c6023028eea6fc5818d61b561ea580 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 10:38:59 +0000
Subject: [PATCH 003/181] Update docs

---
 vllm/inputs/registry.py  | 39 +++++++++++++++++++++++++++++++++++++--
 vllm/multimodal/image.py |  5 +++++
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 3c2105c8e93d..c060c5d681a4 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -24,15 +24,50 @@
 
 
 class DummyDataFactories:
+    """Contains factories for dummy data factories."""
+
+    @classmethod
+    def for_hf(cls, hf_config_type: Type[C]):
+        """
+        Decorate a dummy data factory that uses a specific type of
+        HuggingFace config.
+        
+        The returned function satisfies the interface of
+        :data:`DummyDataFactory`, with runtime checks being made to ensure
+        the validity of the inputs.
+        """
+
+        def wrapper(
+            factory: Callable[[int, C], Tuple["SequenceData",
+                                              Optional["MultiModalData"]]],
+        ) -> DummyDataFactory:
+
+            def inner(
+                seq_len: int,
+                model_config: "ModelConfig",
+            ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
+                hf_config = model_config.hf_config
+                if not isinstance(hf_config, hf_config_type):
+                    raise TypeError("Invalid type of HuggingFace config. "
+                                    f"Expected type: {hf_config_type}, but "
+                                    f"received type: {type(hf_config)}")
+
+                return factory(seq_len, hf_config)
+
+            return inner
+
+        return wrapper
 
     @classmethod
     def for_multimodal_hf(cls, hf_config_type: Type[C]):
-        """Decorates a dummy data factory that uses multimodal config as well
+        """
+        Decorate a dummy data factory that uses multimodal config as well
         as a specific type of HuggingFace config.
         
         The returned function satisfies the interface of
         :data:`DummyDataFactory`, with runtime checks being made to ensure
-        the validity of the inputs."""
+        the validity of the inputs.
+        """
 
         def wrapper(
             factory: Callable[[int, "VisionLanguageConfig", C],
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 498f4bcc1b0d..084dde84bc2a 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -35,6 +35,7 @@ def _get_dummy_seq_data(
 
 
 class DummyImageDataFactories:
+    """Contains factories for dummy image data factories."""
 
     @classmethod
     def _get_clip_num_patches(
@@ -188,6 +189,10 @@ def for_model(
         cls,
         hf_config_type: Type[PretrainedConfig],
     ) -> DummyDataFactory:
+        """
+        Create an dummy image data factory for a model as identified
+        by the config type.
+        """
         if hf_config_type == LlavaConfig:
             return DummyDataFactories.for_multimodal_hf(LlavaConfig) \
                 (cls._dummy_data_for_llava)

From d8c6488dd3fc2ecdbe6af1101e0317e65e49a551 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 11:11:25 +0000
Subject: [PATCH 004/181] Rename `process_input` to `map_input`

---
 .../{test_processor.py => test_mapper.py}     |  6 +-
 vllm/inputs/registry.py                       |  5 ++
 vllm/model_executor/models/llava.py           |  4 +-
 vllm/multimodal/base.py                       | 49 ++++++++-------
 vllm/multimodal/image.py                      |  4 +-
 vllm/multimodal/registry.py                   | 62 +++++++++----------
 vllm/worker/cpu_model_runner.py               | 10 +--
 vllm/worker/model_runner.py                   | 10 +--
 8 files changed, 78 insertions(+), 72 deletions(-)
 rename tests/multimodal/{test_processor.py => test_mapper.py} (94%)

diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_mapper.py
similarity index 94%
rename from tests/multimodal/test_processor.py
rename to tests/multimodal/test_mapper.py
index 4aeae633d07f..13c1d1e342c7 100644
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_mapper.py
@@ -38,7 +38,7 @@ def test_clip_image_processor(hf_images, dtype):
             image,
             return_tensors="np",
         )
-        vllm_result = MULTIMODAL_REGISTRY.process_input(
+        vllm_result = MULTIMODAL_REGISTRY.map_input(
             ImagePixelData(image),
             model_config=model_config,
             vlm_config=vlm_config,
@@ -76,12 +76,12 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
     )
 
     for image, tensor in zip(hf_images, vllm_image_tensors):
-        image_result = MULTIMODAL_REGISTRY.process_input(
+        image_result = MULTIMODAL_REGISTRY.map_input(
             ImagePixelData(image),
             model_config=model_config,
             vlm_config=vlm_config,
         )
-        tensor_result = MULTIMODAL_REGISTRY.process_input(
+        tensor_result = MULTIMODAL_REGISTRY.map_input(
             ImagePixelData(tensor),
             model_config=model_config,
             vlm_config=vlm_config,
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index c060c5d681a4..f84216783b4b 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -6,6 +6,8 @@
 
 from vllm.logger import init_logger
 
+from .data import LLMInputs
+
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VisionLanguageConfig
     from vllm.multimodal import MultiModalData, MultiModalRegistry
@@ -20,6 +22,9 @@
                             Tuple["SequenceData", Optional["MultiModalData"]]]
 """Create dummy data to be inputted into the model."""
 
+InputProcessor = Callable[[LLMInputs], LLMInputs]
+"""Processes the inputs to the model."""
+
 C = TypeVar("C", bound=PretrainedConfig)
 
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index db82673e29c2..91edee8b804f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -84,8 +84,8 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
 
-@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input()
-@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input()
+@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
+@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(
     DummyImageDataFactories.for_model(LlavaConfig))
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 17c48865175f..9f252af13d36 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -32,8 +32,8 @@ class MultiModalData:
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
-MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
-                                    Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[D, ModelConfig, VisionLanguageConfig],
+                                 Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
@@ -51,8 +51,8 @@ class MultiModalPlugin(ABC, Generic[D]):
     """
 
     def __init__(self) -> None:
-        self._input_processors: Dict[Type["nn.Module"],
-                                     MultiModalInputProcessor[D]] = {}
+        self._input_mappers: Dict[Type["nn.Module"],
+                                  MultiModalInputMapper[D]] = {}
 
     @abstractmethod
     def get_data_type(self) -> Type[D]:
@@ -63,7 +63,7 @@ def get_data_type(self) -> Type[D]:
         raise NotImplementedError
 
     @abstractmethod
-    def _default_input_processor(
+    def _default_input_mapper(
             self, data: D, model_config: ModelConfig,
             vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
@@ -72,39 +72,40 @@ def _default_input_processor(
         """
         raise NotImplementedError
 
-    def register_input_processor(self,
-                                 processor: Optional[
-                                     MultiModalInputProcessor[D]] = None):
+    def register_input_mapper(
+        self,
+        mapper: Optional[MultiModalInputMapper[D]] = None,
+    ):
         """
-        Register an input processor to a model class.
+        Register an input mapper to a model class.
         
         When the model receives input data that matches the modality served by
-        this plugin (see :meth:`get_data_type`), the provided input processor is
-        applied to preprocess the data. If `None` is provided, then the default
-        input processor is applied instead.
+        this plugin (see :meth:`get_data_type`), the provided function is
+        invoked to transform the data into a dictionary of model inputs.
+        If `None` is provided, then the default input mapper is used instead.
         """
 
         def wrapper(model_cls: N) -> N:
-            if model_cls in self._input_processors:
+            if model_cls in self._input_mappers:
                 logger.warning(
-                    "Model class %s already has an input processor "
+                    "Model class %s already has an input mapper "
                     "registered to %s. It is overwritten by the new one.",
                     model_cls, self)
 
-            self._input_processors[model_cls] = processor \
-                or self._default_input_processor
+            self._input_mappers[model_cls] = mapper \
+                or self._default_input_mapper
 
             return model_cls
 
         return wrapper
 
-    def process_input(
+    def map_input(
             self, data: D, model_config: ModelConfig,
             vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
         """
-        Apply an input processor to a :class:`~MultiModalData` instance passed
-        to the model.
-        
+        Apply an input mapper to a :class:`~MultiModalData` instance passed
+        to the model, transforming the data into a dictionary of model inputs.
+
         The model is identified by ``model_config``. ``vlm_config`` is
         for compatibility purposes and may be merged into ``model_config``
         in the near future.
@@ -114,9 +115,9 @@ def process_input(
 
         model_cls, _ = get_model_architecture(model_config)
 
-        processor = self._input_processors.get(model_cls)
-        if processor is None:
-            raise KeyError(f"No input processor in {self} is registered for "
+        mapper = self._input_mappers.get(model_cls)
+        if mapper is None:
+            raise KeyError(f"No input mapper in {self} is registered for "
                            f"model class {model_cls.__name__}.")
 
-        return processor(data, model_config, vlm_config)
+        return mapper(data, model_config, vlm_config)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 084dde84bc2a..4ec5b1abc0d1 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -238,7 +238,7 @@ def _get_hf_image_processor(self, model_config: ModelConfig,
             revision=vlm_config.image_processor_revision,
         )
 
-    def _default_input_processor(
+    def _default_input_mapper(
             self, data: ImagePixelData, model_config: ModelConfig,
             vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
         image = data.image
@@ -279,7 +279,7 @@ class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
     def get_data_type(self) -> Type[ImageFeatureData]:
         return ImageFeatureData
 
-    def _default_input_processor(
+    def _default_input_mapper(
             self, data: ImageFeatureData, model_config: ModelConfig,
             vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
         image_features = data.image_features.to(model_config.dtype)
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 9641efccc08a..13d8059c279a 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,7 +6,7 @@
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.logger import init_logger
 
-from .base import MultiModalData, MultiModalInputProcessor, MultiModalPlugin
+from .base import MultiModalData, MultiModalInputMapper, MultiModalPlugin
 from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
                     ImagePixelPlugin)
 
@@ -51,57 +51,57 @@ def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]):
         msg = f"Unknown multi-modal data type: {data_type}"
         raise NotImplementedError(msg)
 
-    def register_input(
-            self,
-            data_type: Type[D],
-            processor: Optional[MultiModalInputProcessor[D]] = None):
+    def register_input_mapper(
+        self,
+        data_type: Type[D],
+        mapper: Optional[MultiModalInputMapper[D]] = None,
+    ):
         """
-        Register an input processor for a specific modality to a model class.
+        Register an input mapper for a specific modality to a model class.
 
-        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
         return self._get_plugin_for_data_type(data_type) \
-            .register_input_processor(processor)
+            .register_input_mapper(mapper)
 
-    def register_image_pixel_input(
-            self,
-            processor: Optional[
-                MultiModalInputProcessor[ImagePixelData]] = None):
+    def register_image_pixel_input_mapper(
+        self,
+        mapper: Optional[MultiModalInputMapper[ImagePixelData]] = None,
+    ):
         """
-        Register an input processor for image pixel data to a model class.
+        Register an input mapper for image pixel data to a model class.
 
-        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input(ImagePixelData, processor)
+        return self.register_input_mapper(ImagePixelData, mapper)
 
-    def register_image_feature_input(
+    def register_image_feature_input_mapper(
         self,
-        processor: Optional[
-            MultiModalInputProcessor[ImageFeatureData]] = None):
+        mapper: Optional[MultiModalInputMapper[ImageFeatureData]] = None,
+    ):
         """
-        Register an input processor for image feature data to a model class.
+        Register an input mapper for image feature data to a model class.
 
-        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input(ImageFeatureData, processor)
+        return self.register_input_mapper(ImageFeatureData, mapper)
 
-    def process_input(self, data: MultiModalData, model_config: ModelConfig,
-                      vlm_config: VisionLanguageConfig):
+    def map_input(self, data: MultiModalData, model_config: ModelConfig,
+                  vlm_config: VisionLanguageConfig):
         """
-        Apply an input processor to a :class:`~MultiModalData` instance passed
+        Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model.
         
-        See :meth:`MultiModalPlugin.process_input` for more details.
+        See :meth:`MultiModalPlugin.map_input` for more details.
         """
         return self._get_plugin_for_data_type(type(data)) \
-            .process_input(data, model_config, vlm_config)
+            .map_input(data, model_config, vlm_config)
 
-    def create_input_processor(self, model_config: ModelConfig,
-                               vlm_config: VisionLanguageConfig):
+    def create_input_mapper(self, model_config: ModelConfig,
+                            vlm_config: VisionLanguageConfig):
         """
-        Create an input processor (see :meth:`process_input`) for a
-        specific model.
+        Create an input mapper (see :meth:`map_input`) for a specific model.
         """
-        return functools.partial(self.process_input,
+        return functools.partial(self.map_input,
                                  model_config=model_config,
                                  vlm_config=vlm_config)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 11630dd4612d..e8c79ce9d9d5 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -67,13 +67,13 @@ def __init__(
 
         # Create processor for multi-modal data
         if self.vision_language_config is not None:
-            self.multi_modal_input_processor = INPUT_REGISTRY.MULTIMODAL \
-                .create_input_processor(
+            self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
+                .create_input_mapper(
                     self.model_config,
                     self.vision_language_config,
                 )
         else:
-            self.multi_modal_input_processor = None
+            self.multi_modal_input_mapper = None
 
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
@@ -124,12 +124,12 @@ def _prepare_prompt(
             mm_data = seq_group_metadata.multi_modal_data
             if mm_data is not None:
                 # Process multi-modal data
-                if self.multi_modal_input_processor is None:
+                if self.multi_modal_input_mapper is None:
                     raise ValueError(
                         "Multi-modal inputs are only supported by "
                         "vision language models.")
 
-                mm_kwargs = self.multi_modal_input_processor(mm_data)
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
                 for k, v in mm_kwargs.items():
                     multi_modal_kwargs_list[k].append(v)
 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index fb655b07748f..94480c9f9095 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -125,13 +125,13 @@ def __init__(
 
         # Create processor for multi-modal data
         if self.vision_language_config is not None:
-            self.multi_modal_input_processor = INPUT_REGISTRY.MULTIMODAL \
-                .create_input_processor(
+            self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
+                .create_input_mapper(
                     self.model_config,
                     self.vision_language_config,
                 )
         else:
-            self.multi_modal_input_processor = None
+            self.multi_modal_input_mapper = None
 
         # Lazy initialization
         self.model: nn.Module  # Set after load_model
@@ -432,12 +432,12 @@ def _prepare_model_input(
                 mm_data = seq_group_metadata.multi_modal_data
                 if mm_data is not None:
                     # Process multi-modal data
-                    if self.multi_modal_input_processor is None:
+                    if self.multi_modal_input_mapper is None:
                         raise ValueError(
                             "Multi-modal inputs are only supported by "
                             "vision language models.")
 
-                    mm_kwargs = self.multi_modal_input_processor(mm_data)
+                    mm_kwargs = self.multi_modal_input_mapper(mm_data)
                     for k, v in mm_kwargs.items():
                         multi_modal_kwargs_list[k].append(v)
 

From f18de48734bfbfde9969e8e6f59c3a02ddbf342a Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 12:38:07 +0000
Subject: [PATCH 005/181] Reorder arguments

---
 vllm/inputs/registry.py  | 183 +++++++++++++++++++++++++++++++--------
 vllm/multimodal/image.py |  10 +--
 2 files changed, 152 insertions(+), 41 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index f84216783b4b..deebb41b03bd 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,5 +1,6 @@
 from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
                     TypeVar)
+from typing_extensions import Concatenate, ParamSpec
 
 from torch import nn
 from transformers import PretrainedConfig
@@ -18,16 +19,67 @@
 D = TypeVar("D", bound="MultiModalData")
 N = TypeVar("N", bound=Type[nn.Module])
 
-DummyDataFactory = Callable[[int, "ModelConfig"],
+DummyDataFactory = Callable[["ModelConfig", int],
                             Tuple["SequenceData", Optional["MultiModalData"]]]
 """Create dummy data to be inputted into the model."""
 
-InputProcessor = Callable[[LLMInputs], LLMInputs]
-"""Processes the inputs to the model."""
+InputProcessor = Callable[["ModelConfig", LLMInputs], LLMInputs]
+"""Preprocess the inputs to the model."""
 
+P, R = ParamSpec("P"), TypeVar("R")
 C = TypeVar("C", bound=PretrainedConfig)
 
 
+def _for_hf(hf_config_type: Type[C]):
+    def wrapper(
+        fn: Callable[Concatenate[C, P], R],
+    ) -> Callable[Concatenate["ModelConfig", P], R]:
+
+        def inner(
+            model_config: "ModelConfig",
+            *args: P.args,
+            **kwargs: P.kwargs,
+        ) -> R:
+            hf_config = model_config.hf_config
+            if not isinstance(hf_config, hf_config_type):
+                raise TypeError("Invalid type of HuggingFace config. "
+                                f"Expected type: {hf_config_type}, but "
+                                f"received type: {type(hf_config)}")
+
+            return fn(hf_config, *args, **kwargs)
+
+        return inner
+
+    return wrapper
+
+
+def _for_multimodal_hf(hf_config_type: Type[C]):
+    def wrapper(
+        factory: Callable[Concatenate["VisionLanguageConfig", C, P], R],
+    ) -> Callable[Concatenate["ModelConfig", P], R]:
+
+        def inner(
+            model_config: "ModelConfig",
+            *args: P.args,
+            **kwargs: P.kwargs,
+        ) -> R:
+            multimodal_config = model_config.multimodal_config
+            if multimodal_config is None:
+                raise ValueError("No multimodal config found")
+
+            hf_config = model_config.hf_config
+            if not isinstance(hf_config, hf_config_type):
+                raise TypeError("Invalid type of HuggingFace config. "
+                                f"Expected type: {hf_config_type}, but "
+                                f"received type: {type(hf_config)}")
+
+            return factory(multimodal_config, hf_config, *args, **kwargs)
+
+        return inner
+
+    return wrapper
+
+
 class DummyDataFactories:
     """Contains factories for dummy data factories."""
 
@@ -43,23 +95,10 @@ def for_hf(cls, hf_config_type: Type[C]):
         """
 
         def wrapper(
-            factory: Callable[[int, C], Tuple["SequenceData",
+            factory: Callable[[C, int], Tuple["SequenceData",
                                               Optional["MultiModalData"]]],
         ) -> DummyDataFactory:
-
-            def inner(
-                seq_len: int,
-                model_config: "ModelConfig",
-            ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
-                hf_config = model_config.hf_config
-                if not isinstance(hf_config, hf_config_type):
-                    raise TypeError("Invalid type of HuggingFace config. "
-                                    f"Expected type: {hf_config_type}, but "
-                                    f"received type: {type(hf_config)}")
-
-                return factory(seq_len, hf_config)
-
-            return inner
+            return _for_hf(hf_config_type)(factory)
 
         return wrapper
 
@@ -75,28 +114,52 @@ def for_multimodal_hf(cls, hf_config_type: Type[C]):
         """
 
         def wrapper(
-            factory: Callable[[int, "VisionLanguageConfig", C],
+            factory: Callable[["VisionLanguageConfig", C, int],
                               Tuple["SequenceData",
                                     Optional["MultiModalData"]]],
         ) -> DummyDataFactory:
+            return _for_multimodal_hf(hf_config_type)(factory)
 
-            def inner(
-                seq_len: int,
-                model_config: "ModelConfig",
-            ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
-                multimodal_config = model_config.multimodal_config
-                if multimodal_config is None:
-                    raise ValueError("No multimodal config found")
+        return wrapper
 
-                hf_config = model_config.hf_config
-                if not isinstance(hf_config, hf_config_type):
-                    raise TypeError("Invalid type of HuggingFace config. "
-                                    f"Expected type: {hf_config_type}, but "
-                                    f"received type: {type(hf_config)}")
 
-                return factory(seq_len, multimodal_config, hf_config)
+class InputProcessors:
+    """Contains factories for input processors."""
 
-            return inner
+    @classmethod
+    def for_hf(cls, hf_config_type: Type[C]):
+        """
+        Decorate an input processor that uses a specific type of
+        HuggingFace config.
+        
+        The returned function satisfies the interface of
+        :data:`InputProcessor`, with runtime checks being made to ensure
+        the validity of the inputs.
+        """
+
+        def wrapper(
+            processor: Callable[[C, LLMInputs], LLMInputs],
+        ) -> InputProcessor:
+            return _for_hf(hf_config_type)(processor)
+
+        return wrapper
+
+    @classmethod
+    def for_multimodal_hf(cls, hf_config_type: Type[C]):
+        """
+        Decorate an input processor that uses multimodal config as well
+        as a specific type of HuggingFace config.
+        
+        The returned function satisfies the interface of
+        :data:`InputProcessor`, with runtime checks being made to ensure
+        the validity of the inputs.
+        """
+
+        def wrapper(
+            processor: Callable[["VisionLanguageConfig", C, LLMInputs],
+                                LLMInputs],
+        ) -> InputProcessor:
+            return _for_multimodal_hf(hf_config_type)(processor)
 
         return wrapper
 
@@ -112,6 +175,8 @@ def __init__(self, *, multimodal_registry: "MultiModalRegistry") -> None:
 
         self._dummy_factories_by_model_type: Dict[Type[nn.Module],
                                                   DummyDataFactory] = {}
+        self._input_processors_by_model_type: Dict[Type[nn.Module],
+                                                   InputProcessor] = {}
 
     @property
     def MULTIMODAL(self) -> "MultiModalRegistry":
@@ -120,8 +185,8 @@ def MULTIMODAL(self) -> "MultiModalRegistry":
 
     def _default_dummy_data_factory(
         self,
-        seq_len: int,
         model_config: "ModelConfig",
+        seq_len: int,
     ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
         """Create dummy data to be inputted into the model."""
         # Avoid circular import
@@ -156,8 +221,8 @@ def wrapper(model_cls: N) -> N:
 
     def dummy_data_for_profiling(
         self,
-        seq_len: int,
         model_config: "ModelConfig",
+        seq_len: int,
     ):
         """Create dummy data for memory profiling."""
         # Avoid circular import
@@ -167,4 +232,50 @@ def dummy_data_for_profiling(
         dummy_factory = self._dummy_factories_by_model_type \
             .get(model_cls, self._default_dummy_data_factory)
 
-        return dummy_factory(seq_len, model_config)
+        return dummy_factory(model_config, seq_len)
+
+    def _default_input_processor(self, inputs: LLMInputs) -> LLMInputs:
+        """Preprocess the inputs to the model."""
+        return inputs
+
+    def register_input_processor(self, processor: InputProcessor):
+        """
+        Register an input processor to a model class.
+
+        The provided function is invoked on each input to the model. This
+        happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._input_processors_by_model_type:
+                logger.warning(
+                    "Model class %s already has input processor "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._input_processors_by_model_type[model_cls] = processor
+
+            return model_cls
+
+        return wrapper
+
+    def process_input(self, model_config: "ModelConfig",
+                      inputs: LLMInputs) -> LLMInputs:
+        """
+        Apply an input processor to an instance of model inputs.
+
+        The model is identified by ``model_config``. ``vlm_config`` is
+        for compatibility purposes and may be merged into ``model_config``
+        in the near future.
+        """
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        model_cls, _ = get_model_architecture(model_config)
+
+        processor = self._input_processors_by_model_type.get(model_cls)
+        if processor is None:
+            raise KeyError(f"No input processor in {self} is registered for "
+                           f"model class {model_cls.__name__}.")
+
+        return processor(model_config, inputs)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 4ec5b1abc0d1..e7e1e5bbe93c 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -51,9 +51,9 @@ def _get_clip_num_patches(
     @classmethod
     def _dummy_data_for_clip(
         cls,
-        seq_len: int,
         multimodal_config: VisionLanguageConfig,
         hf_config: CLIPVisionConfig,
+        seq_len: int,
         *,
         image_token_id: int,
         image_feature_size_override: Optional[int] = None,
@@ -95,17 +95,17 @@ def _dummy_data_for_clip(
     @classmethod
     def _dummy_data_for_llava(
         cls,
-        seq_len: int,
         multimodal_config: VisionLanguageConfig,
         hf_config: LlavaConfig,
+        seq_len: int,
     ):
         vision_config = hf_config.vision_config
 
         if isinstance(vision_config, CLIPVisionConfig):
             return cls._dummy_data_for_clip(
-                seq_len=seq_len,
                 multimodal_config=multimodal_config,
                 hf_config=vision_config,
+                seq_len=seq_len,
                 image_token_id=hf_config.image_token_index,
             )
 
@@ -141,9 +141,9 @@ def _get_llava_next_num_unpadded_features(
     @classmethod
     def _dummy_data_for_llava_next(
         cls,
-        seq_len: int,
         multimodal_config: VisionLanguageConfig,
         hf_config: LlavaNextConfig,
+        seq_len: int,
     ):
         vision_config = hf_config.vision_config
 
@@ -174,9 +174,9 @@ def _dummy_data_for_llava_next(
                 + base_feature_size
 
             return cls._dummy_data_for_clip(
-                seq_len=seq_len,
                 multimodal_config=multimodal_config,
                 hf_config=vision_config,
+                seq_len=seq_len,
                 image_token_id=hf_config.image_token_index,
                 image_feature_size_override=image_feature_size,
             )

From 653537d99f43c211e69958e7ad536f23aaf5087b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 12:48:38 +0000
Subject: [PATCH 006/181] Apply input processor

---
 vllm/engine/async_llm_engine.py |  8 +++++---
 vllm/engine/llm_engine.py       | 13 +++++++++----
 vllm/inputs/registry.py         | 23 +++++++++++++++--------
 3 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index db4d2849b3f0..ddca39d67c9d 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -265,9 +265,11 @@ async def process_model_inputs_async(
         else:
             prompt_token_ids = inputs["prompt_token_ids"]
 
-        return LLMInputs(prompt_token_ids=prompt_token_ids,
-                         prompt=inputs.get("prompt"),
-                         multi_modal_data=inputs.get("multi_modal_data"))
+        llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids,
+                               prompt=inputs.get("prompt"),
+                               multi_modal_data=inputs.get("multi_modal_data"))
+
+        return self.input_processor(llm_inputs)
 
     async def add_request_async(
         self,
diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py
index cb5893e707c8..59327f3b56eb 100644
--- a/vllm/engine/llm_engine.py
+++ b/vllm/engine/llm_engine.py
@@ -21,7 +21,7 @@
 from vllm.engine.output_processor.util import create_output_by_sequence_group
 from vllm.executor.executor_base import ExecutorBase
 from vllm.executor.ray_utils import initialize_ray_cluster
-from vllm.inputs import LLMInputs, PromptInputs
+from vllm.inputs import INPUT_REGISTRY, LLMInputs, PromptInputs
 from vllm.logger import init_logger
 from vllm.lora.request import LoRARequest
 from vllm.outputs import (EmbeddingRequestOutput, RequestOutput,
@@ -219,6 +219,9 @@ def __init__(
         self.generation_config_fields = _load_generation_config_dict(
             model_config)
 
+        self.input_processor = INPUT_REGISTRY.create_input_processor(
+            self.model_config)
+
         self.model_executor = executor_class(
             model_config=model_config,
             cache_config=cache_config,
@@ -484,9 +487,11 @@ def process_model_inputs(
         else:
             prompt_token_ids = inputs["prompt_token_ids"]
 
-        return LLMInputs(prompt_token_ids=prompt_token_ids,
-                         prompt=inputs.get("prompt"),
-                         multi_modal_data=inputs.get("multi_modal_data"))
+        llm_inputs = LLMInputs(prompt_token_ids=prompt_token_ids,
+                               prompt=inputs.get("prompt"),
+                               multi_modal_data=inputs.get("multi_modal_data"))
+
+        return self.input_processor(llm_inputs)
 
     def add_request(
         self,
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index deebb41b03bd..edb4ee1823f0 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,9 +1,10 @@
+import functools
 from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
                     TypeVar)
-from typing_extensions import Concatenate, ParamSpec
 
 from torch import nn
 from transformers import PretrainedConfig
+from typing_extensions import Concatenate, ParamSpec
 
 from vllm.logger import init_logger
 
@@ -31,6 +32,7 @@
 
 
 def _for_hf(hf_config_type: Type[C]):
+
     def wrapper(
         fn: Callable[Concatenate[C, P], R],
     ) -> Callable[Concatenate["ModelConfig", P], R]:
@@ -54,6 +56,7 @@ def inner(
 
 
 def _for_multimodal_hf(hf_config_type: Type[C]):
+
     def wrapper(
         factory: Callable[Concatenate["VisionLanguageConfig", C, P], R],
     ) -> Callable[Concatenate["ModelConfig", P], R]:
@@ -138,8 +141,8 @@ def for_hf(cls, hf_config_type: Type[C]):
         """
 
         def wrapper(
-            processor: Callable[[C, LLMInputs], LLMInputs],
-        ) -> InputProcessor:
+            processor: Callable[[C, LLMInputs],
+                                LLMInputs], ) -> InputProcessor:
             return _for_hf(hf_config_type)(processor)
 
         return wrapper
@@ -219,11 +222,8 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def dummy_data_for_profiling(
-        self,
-        model_config: "ModelConfig",
-        seq_len: int,
-    ):
+    def dummy_data_for_profiling(self, model_config: "ModelConfig",
+                                 seq_len: int):
         """Create dummy data for memory profiling."""
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
@@ -279,3 +279,10 @@ def process_input(self, model_config: "ModelConfig",
                            f"model class {model_cls.__name__}.")
 
         return processor(model_config, inputs)
+
+    def create_input_processor(self, model_config: ModelConfig):
+        """
+        Create an input processor (see :meth:`process_input`) for a
+        specific model.
+        """
+        return functools.partial(self.process_input, model_config=model_config)

From a2f5a3cda8b7d60fbb52c6ad5e15ada4d6c76747 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 12:56:43 +0000
Subject: [PATCH 007/181] Remove `VisionLanguageConfig` from input mapper

---
 tests/multimodal/test_mapper.py | 43 +++++++++++++++------------------
 vllm/inputs/registry.py         |  2 +-
 vllm/multimodal/base.py         | 21 ++++++----------
 vllm/multimodal/image.py        | 16 ++++++------
 vllm/multimodal/registry.py     | 14 ++++-------
 vllm/worker/cpu_model_runner.py | 16 ++----------
 vllm/worker/model_runner.py     | 15 ++----------
 7 files changed, 43 insertions(+), 84 deletions(-)

diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 13c1d1e342c7..bb327b975476 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -23,15 +23,14 @@ def test_clip_image_processor(hf_images, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-    )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=32000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=576,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
-    )
+        multimodal_config=VisionLanguageConfig(
+            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+            image_token_id=32000,
+            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+            image_feature_size=576,
+            image_processor=MODEL_NAME,
+            image_processor_revision=None,
+        ))
 
     for image in hf_images:
         hf_result = hf_processor.preprocess(
@@ -39,9 +38,8 @@ def test_clip_image_processor(hf_images, dtype):
             return_tensors="np",
         )
         vllm_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(image),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -65,26 +63,23 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-    )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=32000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=576,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
-    )
+        multimodal_config=VisionLanguageConfig(
+            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+            image_token_id=32000,
+            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+            image_feature_size=576,
+            image_processor=MODEL_NAME,
+            image_processor_revision=None,
+        ))
 
     for image, tensor in zip(hf_images, vllm_image_tensors):
         image_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(image),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
         tensor_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(tensor),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
 
         assert image_result.keys() == tensor_result.keys()
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index edb4ee1823f0..6efb0d4d2118 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -280,7 +280,7 @@ def process_input(self, model_config: "ModelConfig",
 
         return processor(model_config, inputs)
 
-    def create_input_processor(self, model_config: ModelConfig):
+    def create_input_processor(self, model_config: "ModelConfig"):
         """
         Create an input processor (see :meth:`process_input`) for a
         specific model.
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 9f252af13d36..49f5ad67907e 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -2,7 +2,7 @@
 from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
                     TypeVar)
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
@@ -32,8 +32,7 @@ class MultiModalData:
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
-MultiModalInputMapper = Callable[[D, ModelConfig, VisionLanguageConfig],
-                                 Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[ModelConfig, D], Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
@@ -63,9 +62,8 @@ def get_data_type(self) -> Type[D]:
         raise NotImplementedError
 
     @abstractmethod
-    def _default_input_mapper(
-            self, data: D, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+    def _default_input_mapper(self, model_config: ModelConfig,
+                              data: D) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
         :meth:`torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
@@ -99,16 +97,11 @@ def wrapper(model_cls: N) -> N:
 
         return wrapper
 
-    def map_input(
-            self, data: D, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+    def map_input(self, model_config: ModelConfig,
+                  data: D) -> Dict[str, "torch.Tensor"]:
         """
         Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model, transforming the data into a dictionary of model inputs.
-
-        The model is identified by ``model_config``. ``vlm_config`` is
-        for compatibility purposes and may be merged into ``model_config``
-        in the near future.
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
@@ -120,4 +113,4 @@ def map_input(
             raise KeyError(f"No input mapper in {self} is registered for "
                            f"model class {model_cls.__name__}.")
 
-        return mapper(data, model_config, vlm_config)
+        return mapper(model_config, data)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index e7e1e5bbe93c..606afd412b06 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -227,8 +227,8 @@ class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
     def get_data_type(self) -> Type[ImagePixelData]:
         return ImagePixelData
 
-    def _get_hf_image_processor(self, model_config: ModelConfig,
-                                vlm_config: VisionLanguageConfig):
+    def _get_hf_image_processor(self, model_config: ModelConfig):
+        vlm_config = model_config.multimodal_config
         if vlm_config is None or vlm_config.image_processor is None:
             return None
 
@@ -238,12 +238,10 @@ def _get_hf_image_processor(self, model_config: ModelConfig,
             revision=vlm_config.image_processor_revision,
         )
 
-    def _default_input_mapper(
-            self, data: ImagePixelData, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+    def _default_input_mapper(self, model_config: ModelConfig,
+                              data: ImagePixelData) -> Dict[str, torch.Tensor]:
         image = data.image
-        image_processor = self._get_hf_image_processor(model_config,
-                                                       vlm_config)
+        image_processor = self._get_hf_image_processor(model_config)
 
         if isinstance(image, Image.Image):
             if image_processor is None:
@@ -280,8 +278,8 @@ def get_data_type(self) -> Type[ImageFeatureData]:
         return ImageFeatureData
 
     def _default_input_mapper(
-            self, data: ImageFeatureData, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+            self, model_config: ModelConfig,
+            data: ImageFeatureData) -> Dict[str, torch.Tensor]:
         image_features = data.image_features.to(model_config.dtype)
 
         return {"image_features": image_features}
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 13d8059c279a..9b8e3e7d3b89 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -3,7 +3,7 @@
 
 from torch import nn
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
 from .base import MultiModalData, MultiModalInputMapper, MultiModalPlugin
@@ -86,8 +86,7 @@ def register_image_feature_input_mapper(
         """
         return self.register_input_mapper(ImageFeatureData, mapper)
 
-    def map_input(self, data: MultiModalData, model_config: ModelConfig,
-                  vlm_config: VisionLanguageConfig):
+    def map_input(self, model_config: ModelConfig, data: MultiModalData):
         """
         Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model.
@@ -95,13 +94,10 @@ def map_input(self, data: MultiModalData, model_config: ModelConfig,
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
         return self._get_plugin_for_data_type(type(data)) \
-            .map_input(data, model_config, vlm_config)
+            .map_input(model_config, data)
 
-    def create_input_mapper(self, model_config: ModelConfig,
-                            vlm_config: VisionLanguageConfig):
+    def create_input_mapper(self, model_config: ModelConfig):
         """
         Create an input mapper (see :meth:`map_input`) for a specific model.
         """
-        return functools.partial(self.map_input,
-                                 model_config=model_config,
-                                 vlm_config=vlm_config)
+        return functools.partial(self.map_input, model_config=model_config)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index e8c79ce9d9d5..95d8e44f5111 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -66,14 +66,8 @@ def __init__(
         )
 
         # Create processor for multi-modal data
-        if self.vision_language_config is not None:
-            self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
-                .create_input_mapper(
-                    self.model_config,
-                    self.vision_language_config,
-                )
-        else:
-            self.multi_modal_input_mapper = None
+        self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
+            .create_input_mapper(self.model_config)
 
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
@@ -123,12 +117,6 @@ def _prepare_prompt(
 
             mm_data = seq_group_metadata.multi_modal_data
             if mm_data is not None:
-                # Process multi-modal data
-                if self.multi_modal_input_mapper is None:
-                    raise ValueError(
-                        "Multi-modal inputs are only supported by "
-                        "vision language models.")
-
                 mm_kwargs = self.multi_modal_input_mapper(mm_data)
                 for k, v in mm_kwargs.items():
                     multi_modal_kwargs_list[k].append(v)
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 94480c9f9095..995751884e9d 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -124,14 +124,8 @@ def __init__(
         )
 
         # Create processor for multi-modal data
-        if self.vision_language_config is not None:
-            self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
-                .create_input_mapper(
-                    self.model_config,
-                    self.vision_language_config,
-                )
-        else:
-            self.multi_modal_input_mapper = None
+        self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
+            .create_input_mapper(self.model_config)
 
         # Lazy initialization
         self.model: nn.Module  # Set after load_model
@@ -432,11 +426,6 @@ def _prepare_model_input(
                 mm_data = seq_group_metadata.multi_modal_data
                 if mm_data is not None:
                     # Process multi-modal data
-                    if self.multi_modal_input_mapper is None:
-                        raise ValueError(
-                            "Multi-modal inputs are only supported by "
-                            "vision language models.")
-
                     mm_kwargs = self.multi_modal_input_mapper(mm_data)
                     for k, v in mm_kwargs.items():
                         multi_modal_kwargs_list[k].append(v)

From 378ad80e585bc8e2043b1e267a3e1d052802db8d Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 13:12:50 +0000
Subject: [PATCH 008/181] Fix bad use of `functools.partial`

---
 vllm/inputs/registry.py     | 2 +-
 vllm/multimodal/registry.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 6efb0d4d2118..274273367229 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -285,4 +285,4 @@ def create_input_processor(self, model_config: "ModelConfig"):
         Create an input processor (see :meth:`process_input`) for a
         specific model.
         """
-        return functools.partial(self.process_input, model_config=model_config)
+        return functools.partial(self.process_input, model_config)
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 9b8e3e7d3b89..758bf43ca8fd 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -100,4 +100,4 @@ def create_input_mapper(self, model_config: ModelConfig):
         """
         Create an input mapper (see :meth:`map_input`) for a specific model.
         """
-        return functools.partial(self.map_input, model_config=model_config)
+        return functools.partial(self.map_input, model_config)

From 7aa37787978a3f9e1c316caa30f804eb73260965 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 13:37:27 +0000
Subject: [PATCH 009/181] Use default input processor

---
 vllm/inputs/registry.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 274273367229..3dc361500ff6 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -234,7 +234,8 @@ def dummy_data_for_profiling(self, model_config: "ModelConfig",
 
         return dummy_factory(model_config, seq_len)
 
-    def _default_input_processor(self, inputs: LLMInputs) -> LLMInputs:
+    def _default_input_processor(self, model_config: "ModelConfig",
+                                 inputs: LLMInputs) -> LLMInputs:
         """Preprocess the inputs to the model."""
         return inputs
 
@@ -273,10 +274,8 @@ def process_input(self, model_config: "ModelConfig",
 
         model_cls, _ = get_model_architecture(model_config)
 
-        processor = self._input_processors_by_model_type.get(model_cls)
-        if processor is None:
-            raise KeyError(f"No input processor in {self} is registered for "
-                           f"model class {model_cls.__name__}.")
+        processor = self._input_processors_by_model_type \
+            .get(model_cls, self._default_input_processor)
 
         return processor(model_config, inputs)
 

From 532f8634a8632bd7044e1e176f576a0c88c369a5 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 4 Jun 2024 13:21:53 +0000
Subject: [PATCH 010/181] Fix wrong arguments

---
 vllm/worker/model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index d667f07f3f20..54cba07344c7 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -796,7 +796,7 @@ def profile_run(self) -> None:
                        (group_id < max_num_batched_tokens % max_num_seqs))
 
             seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
-                .dummy_data_for_profiling(seq_len, model_config)
+                .dummy_data_for_profiling(model_config, seq_len)
 
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),

From 080d40ca6f7ae7b056d8e4059920e931163e1686 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 07:20:38 +0000
Subject: [PATCH 011/181] Use pillow image instead of tensor to avoid bypassing
 the processor by default

---
 vllm/multimodal/image.py | 13 +++----------
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 606afd412b06..768af71f36f0 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -75,19 +75,12 @@ def _dummy_data_for_clip(
         multi_modal_data: MultiModalData
         if image_input_type == ImageInputType.PIXEL_VALUES:
             width = height = hf_config.image_size
-            if multimodal_config.image_processor is None:
-                values_dtype = torch.float16
-            else:
-                values_dtype = torch.uint8
-
-            values = torch.zeros((1, 3, width, height), dtype=values_dtype)
-            multi_modal_data = ImagePixelData(values)
+            image = Image.new("RGB", (width, height), color=0)
+            multi_modal_data = ImagePixelData(image)
         elif image_input_type == ImageInputType.IMAGE_FEATURES:
             depth = hf_config.hidden_size
-            values_dtype = torch.float16
-
             values = torch.zeros((1, image_feature_size, depth),
-                                 dtype=values_dtype)
+                                 dtype=torch.float16)
             multi_modal_data = ImageFeatureData(values)
 
         return seq_data, multi_modal_data

From 662693a43d89bfa67ce2777c5df583763885e85b Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 09:05:06 +0000
Subject: [PATCH 012/181] Update interface of dummy data factory and input
 processor

---
 vllm/inputs/registry.py                    |  26 ++--
 vllm/multimodal/image.py                   | 155 +++++++++++----------
 vllm/transformers_utils/image_processor.py |   4 -
 3 files changed, 100 insertions(+), 85 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 3dc361500ff6..7588795361a7 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -34,7 +34,7 @@
 def _for_hf(hf_config_type: Type[C]):
 
     def wrapper(
-        fn: Callable[Concatenate[C, P], R],
+        fn: Callable[Concatenate["ModelConfig", C, P], R],
     ) -> Callable[Concatenate["ModelConfig", P], R]:
 
         def inner(
@@ -48,7 +48,7 @@ def inner(
                                 f"Expected type: {hf_config_type}, but "
                                 f"received type: {type(hf_config)}")
 
-            return fn(hf_config, *args, **kwargs)
+            return fn(model_config, hf_config, *args, **kwargs)
 
         return inner
 
@@ -58,7 +58,8 @@ def inner(
 def _for_multimodal_hf(hf_config_type: Type[C]):
 
     def wrapper(
-        factory: Callable[Concatenate["VisionLanguageConfig", C, P], R],
+        factory: Callable[Concatenate["ModelConfig", "VisionLanguageConfig", C,
+                                      P], R],
     ) -> Callable[Concatenate["ModelConfig", P], R]:
 
         def inner(
@@ -76,7 +77,8 @@ def inner(
                                 f"Expected type: {hf_config_type}, but "
                                 f"received type: {type(hf_config)}")
 
-            return factory(multimodal_config, hf_config, *args, **kwargs)
+            return factory(model_config, multimodal_config, hf_config, *args,
+                           **kwargs)
 
         return inner
 
@@ -98,8 +100,9 @@ def for_hf(cls, hf_config_type: Type[C]):
         """
 
         def wrapper(
-            factory: Callable[[C, int], Tuple["SequenceData",
-                                              Optional["MultiModalData"]]],
+            factory: Callable[["ModelConfig", C, int],
+                              Tuple["SequenceData",
+                                    Optional["MultiModalData"]]],
         ) -> DummyDataFactory:
             return _for_hf(hf_config_type)(factory)
 
@@ -117,7 +120,7 @@ def for_multimodal_hf(cls, hf_config_type: Type[C]):
         """
 
         def wrapper(
-            factory: Callable[["VisionLanguageConfig", C, int],
+            factory: Callable[["ModelConfig", "VisionLanguageConfig", C, int],
                               Tuple["SequenceData",
                                     Optional["MultiModalData"]]],
         ) -> DummyDataFactory:
@@ -141,8 +144,8 @@ def for_hf(cls, hf_config_type: Type[C]):
         """
 
         def wrapper(
-            processor: Callable[[C, LLMInputs],
-                                LLMInputs], ) -> InputProcessor:
+            processor: Callable[["ModelConfig", C, LLMInputs], LLMInputs]
+        ) -> InputProcessor:
             return _for_hf(hf_config_type)(processor)
 
         return wrapper
@@ -159,8 +162,9 @@ def for_multimodal_hf(cls, hf_config_type: Type[C]):
         """
 
         def wrapper(
-            processor: Callable[["VisionLanguageConfig", C, LLMInputs],
-                                LLMInputs],
+            processor: Callable[
+                ["ModelConfig", "VisionLanguageConfig", C, LLMInputs],
+                LLMInputs]
         ) -> InputProcessor:
             return _for_multimodal_hf(hf_config_type)(processor)
 
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 768af71f36f0..4fc3597bbc5b 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,3 +1,4 @@
+from functools import lru_cache
 from typing import Dict, Optional, Tuple, Type, Union
 
 import torch
@@ -11,12 +12,16 @@
 from vllm.inputs.registry import DummyDataFactories, DummyDataFactory
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
-from vllm.transformers_utils.image_processor import cached_get_image_processor
+from vllm.transformers_utils.image_processor import get_image_processor
+from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from .base import MultiModalData, MultiModalPlugin
 
 logger = init_logger(__name__)
 
+_cached_get_tokenizer = lru_cache(get_tokenizer)
+_cached_get_image_processor = lru_cache(get_image_processor)
+
 
 def _get_dummy_seq_data(
     *,
@@ -34,23 +39,80 @@ def _get_dummy_seq_data(
     return SequenceData(token_ids)
 
 
-class DummyImageDataFactories:
-    """Contains factories for dummy image data factories."""
+def _get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:
+    image_size = hf_config.image_size
+    patch_size = hf_config.patch_size
 
-    @classmethod
-    def _get_clip_num_patches(
-        cls,
-        hf_config: CLIPVisionConfig,
-    ) -> int:
-        image_size = hf_config.image_size
-        patch_size = hf_config.patch_size
+    assert image_size % patch_size == 0
+    return image_size // patch_size
+
+
+def _get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
+    num_patches = _get_clip_num_patches(hf_config)
+    return num_patches * num_patches
+
+
+def _get_llava_next_num_unpadded_features(
+    height: int,
+    width: int,
+    npatches: int,
+    num_patch_height: int,
+    num_patch_width: int,
+) -> Tuple[int, int]:
+    # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
+    current_height = npatches * num_patch_height
+    current_width = npatches * num_patch_width
+
+    aspect_ratio: float = width / height
+    current_aspect_ratio: float = current_width / current_height
+    if aspect_ratio > current_aspect_ratio:
+        new_height = (height * current_width) // width
+        current_height = new_height
+    else:
+        new_width = (width * current_height) // height
+        current_width = new_width
+
+    unpadded_features = current_height * current_width
+    newline_features = current_height
+    return (unpadded_features, newline_features)
+
+
+def _get_llava_next_image_feature_size(hf_config: LlavaNextConfig) -> int:
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        num_patches = _get_clip_num_patches(vision_config)
+        base_feature_size = num_patches * num_patches
+
+        # Results in the max possible feature size
+        dummy_height, dummy_width = 448, 448
+        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+            image_size=(dummy_height, dummy_width),
+            grid_pinpoints=hf_config.image_grid_pinpoints,
+            patch_size=vision_config.image_size,
+        )
 
-        assert image_size % patch_size == 0
-        return image_size // patch_size
+        (
+            unpadded_feature_size,
+            newline_feature_size,
+        ) = _get_llava_next_num_unpadded_features(dummy_height, dummy_width,
+                                                  num_patches,
+                                                  num_patch_height,
+                                                  num_patch_width)
+
+        return unpadded_feature_size + newline_feature_size + base_feature_size
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+class DummyImageDataFactories:
+    """Contains factories for dummy image data factories."""
 
     @classmethod
     def _dummy_data_for_clip(
         cls,
+        model_config: ModelConfig,
         multimodal_config: VisionLanguageConfig,
         hf_config: CLIPVisionConfig,
         seq_len: int,
@@ -59,8 +121,7 @@ def _dummy_data_for_clip(
         image_feature_size_override: Optional[int] = None,
     ):
         if image_feature_size_override is None:
-            num_patches = cls._get_clip_num_patches(hf_config)
-            image_feature_size = num_patches * num_patches
+            image_feature_size = _get_clip_image_feature_size(hf_config)
         else:
             image_feature_size = image_feature_size_override
 
@@ -88,6 +149,7 @@ def _dummy_data_for_clip(
     @classmethod
     def _dummy_data_for_llava(
         cls,
+        model_config: ModelConfig,
         multimodal_config: VisionLanguageConfig,
         hf_config: LlavaConfig,
         seq_len: int,
@@ -96,8 +158,9 @@ def _dummy_data_for_llava(
 
         if isinstance(vision_config, CLIPVisionConfig):
             return cls._dummy_data_for_clip(
-                multimodal_config=multimodal_config,
-                hf_config=vision_config,
+                model_config,
+                multimodal_config,
+                vision_config,
                 seq_len=seq_len,
                 image_token_id=hf_config.image_token_index,
             )
@@ -105,70 +168,22 @@ def _dummy_data_for_llava(
         msg = f"Unsupported vision config: {type(vision_config)}"
         raise NotImplementedError(msg)
 
-    @classmethod
-    def _get_llava_next_num_unpadded_features(
-        cls,
-        height: int,
-        width: int,
-        npatches: int,
-        num_patch_height: int,
-        num_patch_width: int,
-    ) -> Tuple[int, int]:
-        # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
-        current_height = npatches * num_patch_height
-        current_width = npatches * num_patch_width
-
-        aspect_ratio: float = width / height
-        current_aspect_ratio: float = current_width / current_height
-        if aspect_ratio > current_aspect_ratio:
-            new_height = (height * current_width) // width
-            current_height = new_height
-        else:
-            new_width = (width * current_height) // height
-            current_width = new_width
-
-        unpadded_features = current_height * current_width
-        newline_features = current_height
-        return (unpadded_features, newline_features)
-
     @classmethod
     def _dummy_data_for_llava_next(
         cls,
+        model_config: ModelConfig,
         multimodal_config: VisionLanguageConfig,
         hf_config: LlavaNextConfig,
         seq_len: int,
     ):
         vision_config = hf_config.vision_config
+        image_feature_size = _get_llava_next_image_feature_size(hf_config)
 
         if isinstance(vision_config, CLIPVisionConfig):
-            num_patches = cls._get_clip_num_patches(vision_config)
-            base_feature_size = num_patches * num_patches
-
-            # Results in the max possible feature size
-            dummy_height, dummy_width = 448, 448
-            num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-                image_size=(dummy_height, dummy_width),
-                grid_pinpoints=hf_config.image_grid_pinpoints,
-                patch_size=vision_config.image_size,
-            )
-
-            (
-                unpadded_feature_size,
-                newline_feature_size,
-            ) = cls._get_llava_next_num_unpadded_features(
-                dummy_height,
-                dummy_width,
-                num_patches,
-                num_patch_height,
-                num_patch_width,
-            )
-
-            image_feature_size = unpadded_feature_size + newline_feature_size \
-                + base_feature_size
-
             return cls._dummy_data_for_clip(
-                multimodal_config=multimodal_config,
-                hf_config=vision_config,
+                model_config,
+                multimodal_config,
+                vision_config,
                 seq_len=seq_len,
                 image_token_id=hf_config.image_token_index,
                 image_feature_size_override=image_feature_size,
@@ -225,7 +240,7 @@ def _get_hf_image_processor(self, model_config: ModelConfig):
         if vlm_config is None or vlm_config.image_processor is None:
             return None
 
-        return cached_get_image_processor(
+        return _cached_get_image_processor(
             vlm_config.image_processor,
             trust_remote_code=model_config.trust_remote_code,
             revision=vlm_config.image_processor_revision,
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index 3239b1d0cfa2..2bb5215d4846 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,4 +1,3 @@
-from functools import lru_cache
 from typing import Optional
 
 from transformers import AutoImageProcessor
@@ -40,6 +39,3 @@ def get_image_processor(
             raise e
 
     return processor
-
-
-cached_get_image_processor = lru_cache(get_image_processor)

From 9bc5fcc620dca0e47d3c51ccb8b1a71af03f62be Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 09:53:08 +0000
Subject: [PATCH 013/181] Use `InputContext` to handle checked type cast of
 config types

---
 vllm/inputs/registry.py  | 182 ++++++++-------------------------------
 vllm/multimodal/base.py  |   7 +-
 vllm/multimodal/image.py |  26 ++++--
 3 files changed, 57 insertions(+), 158 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 7588795361a7..3bdfa1122b3a 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,10 +1,10 @@
 import functools
+from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
                     TypeVar)
 
 from torch import nn
 from transformers import PretrainedConfig
-from typing_extensions import Concatenate, ParamSpec
 
 from vllm.logger import init_logger
 
@@ -17,158 +17,43 @@
 
 logger = init_logger(__name__)
 
-D = TypeVar("D", bound="MultiModalData")
-N = TypeVar("N", bound=Type[nn.Module])
-
-DummyDataFactory = Callable[["ModelConfig", int],
-                            Tuple["SequenceData", Optional["MultiModalData"]]]
-"""Create dummy data to be inputted into the model."""
-
-InputProcessor = Callable[["ModelConfig", LLMInputs], LLMInputs]
-"""Preprocess the inputs to the model."""
-
-P, R = ParamSpec("P"), TypeVar("R")
 C = TypeVar("C", bound=PretrainedConfig)
 
 
-def _for_hf(hf_config_type: Type[C]):
-
-    def wrapper(
-        fn: Callable[Concatenate["ModelConfig", C, P], R],
-    ) -> Callable[Concatenate["ModelConfig", P], R]:
-
-        def inner(
-            model_config: "ModelConfig",
-            *args: P.args,
-            **kwargs: P.kwargs,
-        ) -> R:
-            hf_config = model_config.hf_config
-            if not isinstance(hf_config, hf_config_type):
-                raise TypeError("Invalid type of HuggingFace config. "
-                                f"Expected type: {hf_config_type}, but "
-                                f"received type: {type(hf_config)}")
-
-            return fn(model_config, hf_config, *args, **kwargs)
-
-        return inner
-
-    return wrapper
-
-
-def _for_multimodal_hf(hf_config_type: Type[C]):
-
-    def wrapper(
-        factory: Callable[Concatenate["ModelConfig", "VisionLanguageConfig", C,
-                                      P], R],
-    ) -> Callable[Concatenate["ModelConfig", P], R]:
-
-        def inner(
-            model_config: "ModelConfig",
-            *args: P.args,
-            **kwargs: P.kwargs,
-        ) -> R:
-            multimodal_config = model_config.multimodal_config
-            if multimodal_config is None:
-                raise ValueError("No multimodal config found")
-
-            hf_config = model_config.hf_config
-            if not isinstance(hf_config, hf_config_type):
-                raise TypeError("Invalid type of HuggingFace config. "
-                                f"Expected type: {hf_config_type}, but "
-                                f"received type: {type(hf_config)}")
-
-            return factory(model_config, multimodal_config, hf_config, *args,
-                           **kwargs)
-
-        return inner
-
-    return wrapper
-
-
-class DummyDataFactories:
-    """Contains factories for dummy data factories."""
-
-    @classmethod
-    def for_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate a dummy data factory that uses a specific type of
-        HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`DummyDataFactory`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
-
-        def wrapper(
-            factory: Callable[["ModelConfig", C, int],
-                              Tuple["SequenceData",
-                                    Optional["MultiModalData"]]],
-        ) -> DummyDataFactory:
-            return _for_hf(hf_config_type)(factory)
+@dataclass(frozen=True)
+class InputContext:
+    model_config: "ModelConfig"
 
-        return wrapper
+    def get_multimodal_config(self) -> "VisionLanguageConfig":
+        multimodal_config = self.model_config.multimodal_config
+        if multimodal_config is None:
+            raise ValueError("No multimodal config found")
 
-    @classmethod
-    def for_multimodal_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate a dummy data factory that uses multimodal config as well
-        as a specific type of HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`DummyDataFactory`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
+        return multimodal_config
 
-        def wrapper(
-            factory: Callable[["ModelConfig", "VisionLanguageConfig", C, int],
-                              Tuple["SequenceData",
-                                    Optional["MultiModalData"]]],
-        ) -> DummyDataFactory:
-            return _for_multimodal_hf(hf_config_type)(factory)
+    def get_hf_config(self, hf_config_type: Type[C]) -> C:
+        hf_config = self.model_config.hf_config
+        if not isinstance(hf_config, hf_config_type):
+            raise TypeError("Invalid type of HuggingFace config. "
+                            f"Expected type: {hf_config_type}, but "
+                            f"found type: {type(hf_config)}")
 
-        return wrapper
-
-
-class InputProcessors:
-    """Contains factories for input processors."""
-
-    @classmethod
-    def for_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate an input processor that uses a specific type of
-        HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`InputProcessor`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
+        return hf_config
 
-        def wrapper(
-            processor: Callable[["ModelConfig", C, LLMInputs], LLMInputs]
-        ) -> InputProcessor:
-            return _for_hf(hf_config_type)(processor)
 
-        return wrapper
+N = TypeVar("N", bound=Type[nn.Module])
 
-    @classmethod
-    def for_multimodal_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate an input processor that uses multimodal config as well
-        as a specific type of HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`InputProcessor`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
+DummyDataFactory = Callable[[InputContext, int],
+                            Tuple["SequenceData", Optional["MultiModalData"]]]
+"""
+Create dummy data to be inputted into the model.
 
-        def wrapper(
-            processor: Callable[
-                ["ModelConfig", "VisionLanguageConfig", C, LLMInputs],
-                LLMInputs]
-        ) -> InputProcessor:
-            return _for_multimodal_hf(hf_config_type)(processor)
+Note:
+    :data:`InputProcessor` is not applied to the dummy data.
+"""
 
-        return wrapper
+InputProcessor = Callable[[InputContext, LLMInputs], LLMInputs]
+"""Preprocess the inputs to the model."""
 
 
 class InputRegistry:
@@ -192,10 +77,15 @@ def MULTIMODAL(self) -> "MultiModalRegistry":
 
     def _default_dummy_data_factory(
         self,
-        model_config: "ModelConfig",
+        ctx: InputContext,
         seq_len: int,
     ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
-        """Create dummy data to be inputted into the model."""
+        """
+        Create dummy data to be inputted into the model.
+
+        Note:
+            :data:`InputProcessor` is not applied to the dummy data.
+        """
         # Avoid circular import
         from vllm.sequence import SequenceData
 
@@ -236,9 +126,9 @@ def dummy_data_for_profiling(self, model_config: "ModelConfig",
         dummy_factory = self._dummy_factories_by_model_type \
             .get(model_cls, self._default_dummy_data_factory)
 
-        return dummy_factory(model_config, seq_len)
+        return dummy_factory(InputContext(model_config), seq_len)
 
-    def _default_input_processor(self, model_config: "ModelConfig",
+    def _default_input_processor(self, ctx: InputContext,
                                  inputs: LLMInputs) -> LLMInputs:
         """Preprocess the inputs to the model."""
         return inputs
@@ -281,7 +171,7 @@ def process_input(self, model_config: "ModelConfig",
         processor = self._input_processors_by_model_type \
             .get(model_cls, self._default_input_processor)
 
-        return processor(model_config, inputs)
+        return processor(InputContext(model_config), inputs)
 
     def create_input_processor(self, model_config: "ModelConfig"):
         """
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 49f5ad67907e..94ee43d118d3 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -3,6 +3,7 @@
                     TypeVar)
 
 from vllm.config import ModelConfig
+from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
@@ -32,7 +33,7 @@ class MultiModalData:
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
-MultiModalInputMapper = Callable[[ModelConfig, D], Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
@@ -62,7 +63,7 @@ def get_data_type(self) -> Type[D]:
         raise NotImplementedError
 
     @abstractmethod
-    def _default_input_mapper(self, model_config: ModelConfig,
+    def _default_input_mapper(self, ctx: InputContext,
                               data: D) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
         :meth:`torch.nn.Module.forward`. This is similar in concept to
@@ -113,4 +114,4 @@ def map_input(self, model_config: ModelConfig,
             raise KeyError(f"No input mapper in {self} is registered for "
                            f"model class {model_cls.__name__}.")
 
-        return mapper(model_config, data)
+        return mapper(InputContext(model_config), data)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 4fc3597bbc5b..05c04979aaf5 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -9,17 +9,15 @@
     get_anyres_image_grid_shape)
 
 from vllm.config import ModelConfig, VisionLanguageConfig
-from vllm.inputs.registry import DummyDataFactories, DummyDataFactory
+from vllm.inputs.registry import DummyDataFactory, InputContext
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
-from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from .base import MultiModalData, MultiModalPlugin
 
 logger = init_logger(__name__)
 
-_cached_get_tokenizer = lru_cache(get_tokenizer)
 _cached_get_image_processor = lru_cache(get_image_processor)
 
 
@@ -202,11 +200,19 @@ def for_model(
         by the config type.
         """
         if hf_config_type == LlavaConfig:
-            return DummyDataFactories.for_multimodal_hf(LlavaConfig) \
-                (cls._dummy_data_for_llava)
+            return lambda ctx, seq_len: cls._dummy_data_for_llava(
+                ctx.model_config,
+                ctx.get_multimodal_config(),
+                ctx.get_hf_config(LlavaConfig),
+                seq_len=seq_len,
+            )
         if hf_config_type == LlavaNextConfig:
-            return DummyDataFactories.for_multimodal_hf(LlavaNextConfig) \
-                (cls._dummy_data_for_llava_next)
+            return lambda ctx, seq_len: cls._dummy_data_for_llava_next(
+                ctx.model_config,
+                ctx.get_multimodal_config(),
+                ctx.get_hf_config(LlavaNextConfig),
+                seq_len=seq_len,
+            )
 
         msg = f"Unsupported model config: {type(hf_config_type)}"
         raise NotImplementedError(msg)
@@ -246,8 +252,9 @@ def _get_hf_image_processor(self, model_config: ModelConfig):
             revision=vlm_config.image_processor_revision,
         )
 
-    def _default_input_mapper(self, model_config: ModelConfig,
+    def _default_input_mapper(self, ctx: InputContext,
                               data: ImagePixelData) -> Dict[str, torch.Tensor]:
+        model_config = ctx.model_config
         image = data.image
         image_processor = self._get_hf_image_processor(model_config)
 
@@ -286,8 +293,9 @@ def get_data_type(self) -> Type[ImageFeatureData]:
         return ImageFeatureData
 
     def _default_input_mapper(
-            self, model_config: ModelConfig,
+            self, ctx: InputContext,
             data: ImageFeatureData) -> Dict[str, torch.Tensor]:
+        model_config = ctx.model_config
         image_features = data.image_features.to(model_config.dtype)
 
         return {"image_features": image_features}

From 911cac7411ecdcc4af495943a51d13cbd19e7a61 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 11:06:00 +0000
Subject: [PATCH 014/181] Add input processor for injecting image tokens; fix
 docs

---
 tests/conftest.py                   |   9 -
 tests/models/test_llava.py          |   6 +-
 vllm/inputs/data.py                 |   5 +
 vllm/inputs/registry.py             | 186 +++++---------------
 vllm/model_executor/models/llava.py |   4 +-
 vllm/multimodal/base.py             |  11 +-
 vllm/multimodal/image.py            | 254 ++++++++++++++++++++++++----
 7 files changed, 277 insertions(+), 198 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 764374a779d9..8301b4cf6108 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -113,15 +113,6 @@ def vllm_image_tensors(request) -> List[torch.Tensor]:
     return [torch.load(filename) for filename in _PIXEL_VALUES_FILES]
 
 
-@pytest.fixture()
-def vllm_image_prompts(request) -> List[str]:
-    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    return [
-        "<image>" * (vision_language_config.image_feature_size - 1) + p
-        for p in _IMAGE_PROMPTS
-    ]
-
-
 @pytest.fixture
 def example_prompts() -> List[str]:
     prompts = []
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index cc0685ca9c5e..a85a373e73ed 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -81,8 +81,8 @@ def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
-                vllm_image_prompts, vllm_images, model_and_config, dtype: str,
-                max_tokens: int, worker_use_ray: bool) -> None:
+                vllm_images, model_and_config, dtype: str, max_tokens: int,
+                worker_use_ray: bool) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -105,7 +105,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
                              worker_use_ray=worker_use_ray,
                              enforce_eager=True,
                              **as_dict(vision_language_config))
-    vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+    vllm_outputs = vllm_model.generate_greedy(hf_image_prompts,
                                               max_tokens,
                                               images=vllm_images)
     del vllm_model
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 85c9cd84f5ed..2c600e9793f3 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -125,6 +125,11 @@ class TextTokensPrompt(TypedDict):
 
 
 class LLMInputs(TypedDict):
+    """
+    The inputs in :class:`~vllm.LLMEngine` before they are
+    passed to the model executor.
+    """
+
     prompt_token_ids: List[int]
     prompt: NotRequired[Optional[str]]
     multi_modal_data: NotRequired[Optional["MultiModalData"]]
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 7588795361a7..78f1fdea8945 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -1,10 +1,10 @@
 import functools
+from dataclasses import dataclass
 from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
                     TypeVar)
 
 from torch import nn
 from transformers import PretrainedConfig
-from typing_extensions import Concatenate, ParamSpec
 
 from vllm.logger import init_logger
 
@@ -17,164 +17,49 @@
 
 logger = init_logger(__name__)
 
-D = TypeVar("D", bound="MultiModalData")
-N = TypeVar("N", bound=Type[nn.Module])
-
-DummyDataFactory = Callable[["ModelConfig", int],
-                            Tuple["SequenceData", Optional["MultiModalData"]]]
-"""Create dummy data to be inputted into the model."""
-
-InputProcessor = Callable[["ModelConfig", LLMInputs], LLMInputs]
-"""Preprocess the inputs to the model."""
-
-P, R = ParamSpec("P"), TypeVar("R")
 C = TypeVar("C", bound=PretrainedConfig)
 
 
-def _for_hf(hf_config_type: Type[C]):
-
-    def wrapper(
-        fn: Callable[Concatenate["ModelConfig", C, P], R],
-    ) -> Callable[Concatenate["ModelConfig", P], R]:
-
-        def inner(
-            model_config: "ModelConfig",
-            *args: P.args,
-            **kwargs: P.kwargs,
-        ) -> R:
-            hf_config = model_config.hf_config
-            if not isinstance(hf_config, hf_config_type):
-                raise TypeError("Invalid type of HuggingFace config. "
-                                f"Expected type: {hf_config_type}, but "
-                                f"received type: {type(hf_config)}")
-
-            return fn(model_config, hf_config, *args, **kwargs)
-
-        return inner
-
-    return wrapper
-
-
-def _for_multimodal_hf(hf_config_type: Type[C]):
-
-    def wrapper(
-        factory: Callable[Concatenate["ModelConfig", "VisionLanguageConfig", C,
-                                      P], R],
-    ) -> Callable[Concatenate["ModelConfig", P], R]:
-
-        def inner(
-            model_config: "ModelConfig",
-            *args: P.args,
-            **kwargs: P.kwargs,
-        ) -> R:
-            multimodal_config = model_config.multimodal_config
-            if multimodal_config is None:
-                raise ValueError("No multimodal config found")
-
-            hf_config = model_config.hf_config
-            if not isinstance(hf_config, hf_config_type):
-                raise TypeError("Invalid type of HuggingFace config. "
-                                f"Expected type: {hf_config_type}, but "
-                                f"received type: {type(hf_config)}")
-
-            return factory(model_config, multimodal_config, hf_config, *args,
-                           **kwargs)
-
-        return inner
-
-    return wrapper
-
-
-class DummyDataFactories:
-    """Contains factories for dummy data factories."""
-
-    @classmethod
-    def for_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate a dummy data factory that uses a specific type of
-        HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`DummyDataFactory`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
-
-        def wrapper(
-            factory: Callable[["ModelConfig", C, int],
-                              Tuple["SequenceData",
-                                    Optional["MultiModalData"]]],
-        ) -> DummyDataFactory:
-            return _for_hf(hf_config_type)(factory)
+@dataclass(frozen=True)
+class InputContext:
+    model_config: "ModelConfig"
 
-        return wrapper
+    def get_multimodal_config(self) -> "VisionLanguageConfig":
+        multimodal_config = self.model_config.multimodal_config
+        if multimodal_config is None:
+            raise ValueError("No multimodal config found")
 
-    @classmethod
-    def for_multimodal_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate a dummy data factory that uses multimodal config as well
-        as a specific type of HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`DummyDataFactory`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
+        return multimodal_config
 
-        def wrapper(
-            factory: Callable[["ModelConfig", "VisionLanguageConfig", C, int],
-                              Tuple["SequenceData",
-                                    Optional["MultiModalData"]]],
-        ) -> DummyDataFactory:
-            return _for_multimodal_hf(hf_config_type)(factory)
+    def get_hf_config(self, hf_config_type: Type[C]) -> C:
+        hf_config = self.model_config.hf_config
+        if not isinstance(hf_config, hf_config_type):
+            raise TypeError("Invalid type of HuggingFace config. "
+                            f"Expected type: {hf_config_type}, but "
+                            f"found type: {type(hf_config)}")
 
-        return wrapper
-
-
-class InputProcessors:
-    """Contains factories for input processors."""
-
-    @classmethod
-    def for_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate an input processor that uses a specific type of
-        HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`InputProcessor`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
+        return hf_config
 
-        def wrapper(
-            processor: Callable[["ModelConfig", C, LLMInputs], LLMInputs]
-        ) -> InputProcessor:
-            return _for_hf(hf_config_type)(processor)
 
-        return wrapper
+N = TypeVar("N", bound=Type[nn.Module])
 
-    @classmethod
-    def for_multimodal_hf(cls, hf_config_type: Type[C]):
-        """
-        Decorate an input processor that uses multimodal config as well
-        as a specific type of HuggingFace config.
-        
-        The returned function satisfies the interface of
-        :data:`InputProcessor`, with runtime checks being made to ensure
-        the validity of the inputs.
-        """
+DummyDataFactory = Callable[[InputContext, int],
+                            Tuple["SequenceData", Optional["MultiModalData"]]]
+"""
+Create dummy data to be inputted into the model.
 
-        def wrapper(
-            processor: Callable[
-                ["ModelConfig", "VisionLanguageConfig", C, LLMInputs],
-                LLMInputs]
-        ) -> InputProcessor:
-            return _for_multimodal_hf(hf_config_type)(processor)
+Note:
+    :data:`InputProcessor` is not applied to the dummy data.
+"""
 
-        return wrapper
+InputProcessor = Callable[[InputContext, LLMInputs], LLMInputs]
+"""Preprocess the inputs to the model."""
 
 
 class InputRegistry:
     """
-    This registry is used by model runners to dispatch data processing
-    according to its modality and the target model.
+    This registry is used by :class:`~vllm.LLMEngine` to dispatch data
+    processing according to the target model.
     """
 
     def __init__(self, *, multimodal_registry: "MultiModalRegistry") -> None:
@@ -192,10 +77,15 @@ def MULTIMODAL(self) -> "MultiModalRegistry":
 
     def _default_dummy_data_factory(
         self,
-        model_config: "ModelConfig",
+        ctx: InputContext,
         seq_len: int,
     ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
-        """Create dummy data to be inputted into the model."""
+        """
+        Create dummy data to be inputted into the model.
+
+        Note:
+            :data:`InputProcessor` is not applied to the dummy data.
+        """
         # Avoid circular import
         from vllm.sequence import SequenceData
 
@@ -236,9 +126,9 @@ def dummy_data_for_profiling(self, model_config: "ModelConfig",
         dummy_factory = self._dummy_factories_by_model_type \
             .get(model_cls, self._default_dummy_data_factory)
 
-        return dummy_factory(model_config, seq_len)
+        return dummy_factory(InputContext(model_config), seq_len)
 
-    def _default_input_processor(self, model_config: "ModelConfig",
+    def _default_input_processor(self, ctx: InputContext,
                                  inputs: LLMInputs) -> LLMInputs:
         """Preprocess the inputs to the model."""
         return inputs
@@ -281,7 +171,7 @@ def process_input(self, model_config: "ModelConfig",
         processor = self._input_processors_by_model_type \
             .get(model_cls, self._default_input_processor)
 
-        return processor(model_config, inputs)
+        return processor(InputContext(model_config), inputs)
 
     def create_input_processor(self, model_config: "ModelConfig"):
         """
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 91edee8b804f..172fa81ff5cf 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -18,7 +18,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal.image import DummyImageDataFactories
+from vllm.multimodal.image import DummyImageDataFactories, ImageInputProcessors
 from vllm.sequence import SamplerOutput
 
 from .vlm_base import VisionLanguageModelBase
@@ -88,6 +88,8 @@ class LlavaImageFeatureInputs(TypedDict):
 @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(
     DummyImageDataFactories.for_model(LlavaConfig))
+@INPUT_REGISTRY.register_input_processor(
+    ImageInputProcessors.for_model(LlavaConfig))
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 49f5ad67907e..6b4684a54d33 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -3,6 +3,7 @@
                     TypeVar)
 
 from vllm.config import ModelConfig
+from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
@@ -32,9 +33,9 @@ class MultiModalData:
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
-MultiModalInputMapper = Callable[[ModelConfig, D], Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
-:meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
+:meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
 
 
@@ -62,10 +63,10 @@ def get_data_type(self) -> Type[D]:
         raise NotImplementedError
 
     @abstractmethod
-    def _default_input_mapper(self, model_config: ModelConfig,
+    def _default_input_mapper(self, ctx: InputContext,
                               data: D) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
-        :meth:`torch.nn.Module.forward`. This is similar in concept to
+        :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
         """
         raise NotImplementedError
@@ -113,4 +114,4 @@ def map_input(self, model_config: ModelConfig,
             raise KeyError(f"No input mapper in {self} is registered for "
                            f"model class {model_cls.__name__}.")
 
-        return mapper(model_config, data)
+        return mapper(InputContext(model_config), data)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 4fc3597bbc5b..ca592c7ce878 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,5 +1,6 @@
 from functools import lru_cache
-from typing import Dict, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Dict, List, Optional, Tuple, Type, TypeVar,
+                    Union)
 
 import torch
 from PIL import Image
@@ -9,7 +10,7 @@
     get_anyres_image_grid_shape)
 
 from vllm.config import ModelConfig, VisionLanguageConfig
-from vllm.inputs.registry import DummyDataFactories, DummyDataFactory
+from vllm.inputs.registry import DummyDataFactory, InputContext, InputProcessor
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
@@ -17,28 +18,17 @@
 
 from .base import MultiModalData, MultiModalPlugin
 
+if TYPE_CHECKING:
+    from vllm.inputs import LLMInputs
+else:
+    LLMInputs = dict
+
 logger = init_logger(__name__)
 
 _cached_get_tokenizer = lru_cache(get_tokenizer)
 _cached_get_image_processor = lru_cache(get_image_processor)
 
 
-def _get_dummy_seq_data(
-    *,
-    seq_len: int,
-    image_token_id: int,
-    image_feature_size: int,
-) -> SequenceData:
-    # NOTE: We assume that <image> token is repeated `image_feature_size` times
-    # and then concatenated with the text prompt
-    # TODO: Enable other ways of inserting the image into the prompt
-
-    token_ids = [image_token_id] * image_feature_size
-    token_ids += [0] * (seq_len - image_feature_size)
-
-    return SequenceData(token_ids)
-
-
 def _get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:
     image_size = hf_config.image_size
     patch_size = hf_config.patch_size
@@ -107,7 +97,12 @@ def _get_llava_next_image_feature_size(hf_config: LlavaNextConfig) -> int:
 
 
 class DummyImageDataFactories:
-    """Contains factories for dummy image data factories."""
+    """
+    Contains factories for dummy image data factories.
+
+    See Also:
+        :data:`vllm.inputs.registry.DummyDataFactory`
+    """
 
     @classmethod
     def _dummy_data_for_clip(
@@ -125,11 +120,9 @@ def _dummy_data_for_clip(
         else:
             image_feature_size = image_feature_size_override
 
-        seq_data = _get_dummy_seq_data(
-            seq_len=seq_len,
-            image_token_id=image_token_id,
-            image_feature_size=image_feature_size,
-        )
+        token_ids = [image_token_id] * image_feature_size
+        token_ids += [0] * (seq_len - image_feature_size)
+        seq_data = SequenceData(token_ids)
 
         image_input_type = multimodal_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
@@ -202,11 +195,206 @@ def for_model(
         by the config type.
         """
         if hf_config_type == LlavaConfig:
-            return DummyDataFactories.for_multimodal_hf(LlavaConfig) \
-                (cls._dummy_data_for_llava)
+            return lambda ctx, seq_len: cls._dummy_data_for_llava(
+                ctx.model_config,
+                ctx.get_multimodal_config(),
+                ctx.get_hf_config(LlavaConfig),
+                seq_len=seq_len,
+            )
+        if hf_config_type == LlavaNextConfig:
+            return lambda ctx, seq_len: cls._dummy_data_for_llava_next(
+                ctx.model_config,
+                ctx.get_multimodal_config(),
+                ctx.get_hf_config(LlavaNextConfig),
+                seq_len=seq_len,
+            )
+
+        msg = f"Unsupported model config: {type(hf_config_type)}"
+        raise NotImplementedError(msg)
+
+
+_T = TypeVar("_T", str, int)
+
+
+class ImageInputProcessors:
+    """
+    Contains factories for image input processors.
+
+    See Also:
+        :data:`vllm.inputs.registry.InputProcessor`
+    """
+
+    @classmethod
+    def _repeat_and_pad_token(
+        cls,
+        token: _T,
+        *,
+        repeat_count: int = 1,
+        pad_token_left: Optional[_T] = None,
+        pad_token_right: Optional[_T] = None,
+    ) -> List[_T]:
+        replacement = [token] * repeat_count
+        if pad_token_left is not None:
+            replacement = [pad_token_left] + replacement
+        if pad_token_right is not None:
+            replacement = replacement + [pad_token_right]
+
+        return replacement
+
+    @classmethod
+    def _repeat_and_pad_image_tokens(
+        cls,
+        model_config: ModelConfig,
+        llm_inputs: LLMInputs,
+        *,
+        image_token_id: int,
+        repeat_count: int = 1,
+        pad_token_left: Optional[int] = None,
+        pad_token_right: Optional[int] = None,
+    ) -> LLMInputs:
+        multi_modal_data = llm_inputs.get("multi_modal_data")
+        if multi_modal_data is None:
+            return llm_inputs
+
+        tokenizer = _cached_get_tokenizer(model_config.tokenizer)
+        image_token_str = tokenizer.decode(image_token_id)
+        pad_token_str_left = (None if pad_token_left is None else
+                              tokenizer.decode(pad_token_left))
+        pad_token_str_right = (None if pad_token_right is None else
+                               tokenizer.decode(pad_token_right))
+
+        replacement_str = "".join(
+            cls._repeat_and_pad_token(
+                image_token_str,
+                repeat_count=repeat_count,
+                pad_token_left=pad_token_str_left,
+                pad_token_right=pad_token_str_right,
+            ))
+        replacement_ids = cls._repeat_and_pad_token(
+            image_token_id,
+            repeat_count=repeat_count,
+            pad_token_left=pad_token_left,
+            pad_token_right=pad_token_right,
+        )
+
+        # To avoid invoking the tokenizer, we assume that the
+        # image token is called "<image>"
+        prompt = llm_inputs.get("prompt")
+        if prompt is None:
+            new_prompt = None
+        else:
+            # The image tokens are removed to be consistent with HuggingFace
+            new_prompt = prompt.replace(image_token_str, replacement_str, 1)
+
+        prompt_token_ids = llm_inputs["prompt_token_ids"]
+        new_token_ids: List[int] = []
+        for i, token in enumerate(prompt_token_ids):
+            if token == image_token_id:
+                new_token_ids.extend(replacement_ids)
+
+                # No need to further scan the list since we only replace once
+                new_token_ids.extend(prompt_token_ids[i + 1:])
+                break
+            else:
+                new_token_ids.append(token)
+
+        # NOTE: Create a defensive copy of the original inputs
+        return LLMInputs(prompt_token_ids=new_token_ids,
+                         prompt=new_prompt,
+                         multi_modal_data=multi_modal_data)
+
+    @classmethod
+    def _input_processor_for_clip(
+        cls,
+        model_config: ModelConfig,
+        multimodal_config: VisionLanguageConfig,
+        hf_config: CLIPVisionConfig,
+        llm_inputs: LLMInputs,
+        *,
+        image_token_id: int,
+        image_feature_size_override: Optional[int] = None,
+    ):
+        if image_feature_size_override is None:
+            image_feature_size = _get_clip_image_feature_size(hf_config)
+        else:
+            image_feature_size = image_feature_size_override
+
+        return cls._repeat_and_pad_image_tokens(
+            model_config,
+            llm_inputs,
+            image_token_id=image_token_id,
+            repeat_count=image_feature_size,
+        )
+
+    @classmethod
+    def _input_processor_for_llava(
+        cls,
+        model_config: ModelConfig,
+        multimodal_config: VisionLanguageConfig,
+        hf_config: LlavaConfig,
+        llm_inputs: LLMInputs,
+    ):
+        vision_config = hf_config.vision_config
+
+        if isinstance(vision_config, CLIPVisionConfig):
+            return cls._input_processor_for_clip(
+                model_config,
+                multimodal_config,
+                vision_config,
+                llm_inputs,
+                image_token_id=hf_config.image_token_index,
+            )
+
+        msg = f"Unsupported vision config: {type(vision_config)}"
+        raise NotImplementedError(msg)
+
+    @classmethod
+    def _input_processor_for_llava_next(
+        cls,
+        model_config: ModelConfig,
+        multimodal_config: VisionLanguageConfig,
+        hf_config: LlavaNextConfig,
+        llm_inputs: LLMInputs,
+    ):
+        vision_config = hf_config.vision_config
+        image_feature_size = _get_llava_next_image_feature_size(hf_config)
+
+        if isinstance(vision_config, CLIPVisionConfig):
+            return cls._input_processor_for_clip(
+                model_config,
+                multimodal_config,
+                vision_config,
+                llm_inputs,
+                image_token_id=hf_config.image_token_index,
+                image_feature_size_override=image_feature_size,
+            )
+
+        msg = f"Unsupported vision config: {type(vision_config)}"
+        raise NotImplementedError(msg)
+
+    @classmethod
+    def for_model(
+        cls,
+        hf_config_type: Type[PretrainedConfig],
+    ) -> InputProcessor:
+        """
+        Create an input processor for a model as identified
+        by the config type.
+        """
+        if hf_config_type == LlavaConfig:
+            return lambda ctx, llm_inputs: cls._input_processor_for_llava(
+                ctx.model_config,
+                ctx.get_multimodal_config(),
+                ctx.get_hf_config(LlavaConfig),
+                llm_inputs=llm_inputs,
+            )
         if hf_config_type == LlavaNextConfig:
-            return DummyDataFactories.for_multimodal_hf(LlavaNextConfig) \
-                (cls._dummy_data_for_llava_next)
+            return lambda ctx, llm_inputs: cls._input_processor_for_llava_next(
+                ctx.model_config,
+                ctx.get_multimodal_config(),
+                ctx.get_hf_config(LlavaNextConfig),
+                llm_inputs=llm_inputs,
+            )
 
         msg = f"Unsupported model config: {type(hf_config_type)}"
         raise NotImplementedError(msg)
@@ -216,9 +404,9 @@ class ImagePixelData(MultiModalData):
     """
     The pixel data of an image. Can be one of:
 
-    - :class:``PIL.Image``: An image object. Requires that a HuggingFace
+    - :class:`PIL.Image.Image`: An image object. Requires that a HuggingFace
       processor is available to the model.
-    - :class:``torch.Tensor``: The raw pixel data which is passed to the model
+    - :class:`torch.Tensor`: The raw pixel data which is passed to the model
       without additional pre-processing.
     """
 
@@ -246,8 +434,9 @@ def _get_hf_image_processor(self, model_config: ModelConfig):
             revision=vlm_config.image_processor_revision,
         )
 
-    def _default_input_mapper(self, model_config: ModelConfig,
+    def _default_input_mapper(self, ctx: InputContext,
                               data: ImagePixelData) -> Dict[str, torch.Tensor]:
+        model_config = ctx.model_config
         image = data.image
         image_processor = self._get_hf_image_processor(model_config)
 
@@ -286,8 +475,9 @@ def get_data_type(self) -> Type[ImageFeatureData]:
         return ImageFeatureData
 
     def _default_input_mapper(
-            self, model_config: ModelConfig,
+            self, ctx: InputContext,
             data: ImageFeatureData) -> Dict[str, torch.Tensor]:
+        model_config = ctx.model_config
         image_features = data.image_features.to(model_config.dtype)
 
         return {"image_features": image_features}

From a38b347e079e8258aa2435f96735578c003ed9d1 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 11:06:05 +0000
Subject: [PATCH 015/181] Add new documentation pages

---
 .../input_processing/model_inputs_index.rst   | 28 ++++++
 .../multimodal/adding_multimodal_model.rst    | 94 +++++++++++++++++++
 .../dev/multimodal/multimodal_index.rst       | 14 ++-
 docs/source/index.rst                         |  1 +
 docs/source/models/adding_model.rst           |  4 +-
 5 files changed, 136 insertions(+), 5 deletions(-)
 create mode 100644 docs/source/dev/input_processing/model_inputs_index.rst
 create mode 100644 docs/source/dev/multimodal/adding_multimodal_model.rst

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
new file mode 100644
index 000000000000..8529fc1d9af4
--- /dev/null
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -0,0 +1,28 @@
+Input Processing
+================
+
+.. currentmodule:: vllm.inputs
+
+vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
+in :class:`~vllm.LLMEngine` before they are passed to model executors.
+
+.. contents::
+   :local:
+   :backlinks: none
+
+Module Contents
++++++++++++++++
+
+LLM Engine Inputs
+-----------------
+
+.. autoclass:: vllm.inputs.LLMInputs
+    :members:
+    :show-inheritance:
+
+Registry
+--------
+
+.. automodule:: vllm.inputs.registry
+    :members:
+    :show-inheritance:
diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
new file mode 100644
index 000000000000..4a0010d47ba3
--- /dev/null
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -0,0 +1,94 @@
+.. _adding_a_new_multimodal_model:
+
+Adding a New Multimodal Model
+=============================
+
+This document provides a high-level guide on integrating a :ref:`multimodal model <multi_modality>` into vLLM.
+
+.. note::
+    The complexity of adding a new model depends heavily on the model's architecture.
+    The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM.
+    However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex.
+
+.. tip::
+    If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub <https://github.com/vllm-project/vllm/issues>`_ repository.
+    We will be happy to help you out!
+
+
+0. Set up a base vLLM model
+---------------------------
+
+Follow :ref:`these steps <adding_a_new_model>` to first implement the model in vLLM.
+While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
+for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+
+.. code-block:: diff
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+    +   pixel_values: torch.Tensor,
+    ) -> SamplerOutput:
+
+.. note::
+    The model class does not have to be named :code:`*ForCausalLM`.
+    Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.
+
+
+1. Register input mappers
+-------------------------
+
+For each modality type to support, decorate the model class with :meth:`vllm.INPUT_REGISTRY.MULTIMODAL.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
+This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.
+
+.. code-block:: diff
+
+    + from vllm.inputs import INPUT_REGISTRY
+
+    + @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
+    + @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
+    class YourModelForImage2Seq(nn.Module):
+
+A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
+
+
+2. (Optional) Register dummy data
+---------------------------------
+
+During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
+In such cases, you can define your own dummy data by registering a factory method via :meth:`vllm.inputs.INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
+
+.. code-block:: diff
+
+    from vllm.inputs import INPUT_REGISTRY
+
+    @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
+    @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
+    + @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+    class YourModelForImage2Seq(nn.Module):
+
+Refer to :class:`vllm.multimodal.image.DummyImageDataFactories` for some examples of dummy data factories.
+
+
+3. (Optional) Register input processor
+--------------------------------------
+
+Sometimes, there is a need to process inputs at the :class:~vllm.LLMEngine` level before they are passed to the model executor.
+You can register input processors via :meth:`vllm.inputs.INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
+
+.. code-block:: diff
+
+    from vllm.inputs import INPUT_REGISTRY
+
+    @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
+    @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
+    @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
+    + @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
+    class YourModelForImage2Seq(nn.Module):
+
+A common use case of input processors is inserting extra image tokens to leverage the vLLM framework for attention mask generation.
+More details can be found in :class:`vllm.multimodal.image.ImageInputProcessors`.
+
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index 719d6e12ddfd..ad5ba07aefd9 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -1,3 +1,5 @@
+.. _multi_modality:
+
 Multi-Modality
 ==============
 
@@ -8,9 +10,15 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
 which allows you to pass in multi-modal input alongside text and token prompts.
 
-By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
-you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
-as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
+By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`this guide <adding_a_new_multimodal_model>`.
+
+Guides
+++++++
+
+.. toctree::
+   :maxdepth: 1
+
+   adding_multimodal_model
 
 .. contents::
    :local:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index fad3c3b05b0c..dcca28b3b88c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -107,6 +107,7 @@ Documentation
    dev/offline_inference/offline_index
    dev/engine/engine_index
    dev/kernel/paged_attention
+   dev/input_processing/model_inputs_index
    dev/multimodal/multimodal_index
    dev/dockerfile/dockerfile
 
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index cbc8099e6f70..f282b594590b 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -37,7 +37,7 @@ For instance, vLLM's `OPT model <https://github.com/vllm-project/vllm/blob/main/
 2. Rewrite the :code:`forward` methods
 --------------------------------------
 
-Next, you need to rewrite the :code:`forward` methods of your model by following these steps:
+Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your model by following these steps:
 
 1. Remove any unnecessary code, such as the code only used for training.
 2. Change the input parameters:
@@ -75,7 +75,7 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
 
 If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
 To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
+For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
 When it comes to the linear layers, we provide the following options to parallelize them:
 
 * :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.

From 29c3bb32a2731114ef15dc077d1f2088c4ac33cb Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 13:05:58 +0000
Subject: [PATCH 016/181] Fix LLaVA-NeXT input processor and cleanup code

---
 vllm/multimodal/image.py | 134 ++++++++++++++++++++++++---------------
 1 file changed, 82 insertions(+), 52 deletions(-)

diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 05c04979aaf5..80e0c50d1097 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -21,22 +21,6 @@
 _cached_get_image_processor = lru_cache(get_image_processor)
 
 
-def _get_dummy_seq_data(
-    *,
-    seq_len: int,
-    image_token_id: int,
-    image_feature_size: int,
-) -> SequenceData:
-    # NOTE: We assume that <image> token is repeated `image_feature_size` times
-    # and then concatenated with the text prompt
-    # TODO: Enable other ways of inserting the image into the prompt
-
-    token_ids = [image_token_id] * image_feature_size
-    token_ids += [0] * (seq_len - image_feature_size)
-
-    return SequenceData(token_ids)
-
-
 def _get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:
     image_size = hf_config.image_size
     patch_size = hf_config.patch_size
@@ -75,17 +59,20 @@ def _get_llava_next_num_unpadded_features(
     return (unpadded_features, newline_features)
 
 
-def _get_llava_next_image_feature_size(hf_config: LlavaNextConfig) -> int:
+def _get_llava_next_image_feature_size(
+    hf_config: LlavaNextConfig,
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
         num_patches = _get_clip_num_patches(vision_config)
         base_feature_size = num_patches * num_patches
 
-        # Results in the max possible feature size
-        dummy_height, dummy_width = 448, 448
         num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-            image_size=(dummy_height, dummy_width),
+            image_size=(input_height, input_width),
             grid_pinpoints=hf_config.image_grid_pinpoints,
             patch_size=vision_config.image_size,
         )
@@ -93,7 +80,7 @@ def _get_llava_next_image_feature_size(hf_config: LlavaNextConfig) -> int:
         (
             unpadded_feature_size,
             newline_feature_size,
-        ) = _get_llava_next_num_unpadded_features(dummy_height, dummy_width,
+        ) = _get_llava_next_num_unpadded_features(input_height, input_width,
                                                   num_patches,
                                                   num_patch_height,
                                                   num_patch_width)
@@ -108,10 +95,8 @@ class DummyImageDataFactories:
     """Contains factories for dummy image data factories."""
 
     @classmethod
-    def _dummy_data_for_clip(
+    def _dummy_seq_data_for_clip(
         cls,
-        model_config: ModelConfig,
-        multimodal_config: VisionLanguageConfig,
         hf_config: CLIPVisionConfig,
         seq_len: int,
         *,
@@ -123,26 +108,42 @@ def _dummy_data_for_clip(
         else:
             image_feature_size = image_feature_size_override
 
-        seq_data = _get_dummy_seq_data(
-            seq_len=seq_len,
-            image_token_id=image_token_id,
-            image_feature_size=image_feature_size,
-        )
+        token_ids = [image_token_id] * image_feature_size
+        token_ids += [0] * (seq_len - image_feature_size)
+        return SequenceData(token_ids)
 
-        image_input_type = multimodal_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-        multi_modal_data: MultiModalData
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            width = height = hf_config.image_size
-            image = Image.new("RGB", (width, height), color=0)
-            multi_modal_data = ImagePixelData(image)
-        elif image_input_type == ImageInputType.IMAGE_FEATURES:
-            depth = hf_config.hidden_size
-            values = torch.zeros((1, image_feature_size, depth),
-                                 dtype=torch.float16)
-            multi_modal_data = ImageFeatureData(values)
-
-        return seq_data, multi_modal_data
+    @classmethod
+    def _dummy_pixel_data_for_clip(
+        cls,
+        hf_config: CLIPVisionConfig,
+        *,
+        image_width_override: Optional[int] = None,
+        image_height_override: Optional[int] = None,
+    ):
+        width = height = hf_config.image_size
+        if image_width_override is not None:
+            width = image_width_override
+        if image_height_override is not None:
+            height = image_height_override
+
+        image = Image.new("RGB", (width, height), color=0)
+        return ImagePixelData(image)
+
+    @classmethod
+    def _dummy_feature_data_for_clip(
+        cls,
+        hf_config: CLIPVisionConfig,
+        *,
+        image_feature_size_override: Optional[int] = None,
+    ):
+        if image_feature_size_override is None:
+            image_feature_size = _get_clip_image_feature_size(hf_config)
+        else:
+            image_feature_size = image_feature_size_override
+
+        values = torch.zeros((1, image_feature_size, hf_config.hidden_size),
+                             dtype=torch.float16)
+        return ImageFeatureData(values)
 
     @classmethod
     def _dummy_data_for_llava(
@@ -155,14 +156,24 @@ def _dummy_data_for_llava(
         vision_config = hf_config.vision_config
 
         if isinstance(vision_config, CLIPVisionConfig):
-            return cls._dummy_data_for_clip(
-                model_config,
-                multimodal_config,
+            seq_data = cls._dummy_seq_data_for_clip(
                 vision_config,
-                seq_len=seq_len,
+                seq_len,
                 image_token_id=hf_config.image_token_index,
             )
 
+            image_input_type = multimodal_config.image_input_type
+            ImageInputType = VisionLanguageConfig.ImageInputType
+            multi_modal_data: MultiModalData
+            if image_input_type == ImageInputType.PIXEL_VALUES:
+                multi_modal_data = cls._dummy_pixel_data_for_clip(
+                    vision_config)
+            elif image_input_type == ImageInputType.IMAGE_FEATURES:
+                multi_modal_data = cls._dummy_feature_data_for_clip(
+                    vision_config)
+
+            return seq_data, multi_modal_data
+
         msg = f"Unsupported vision config: {type(vision_config)}"
         raise NotImplementedError(msg)
 
@@ -175,18 +186,37 @@ def _dummy_data_for_llava_next(
         seq_len: int,
     ):
         vision_config = hf_config.vision_config
-        image_feature_size = _get_llava_next_image_feature_size(hf_config)
+
+        # Result in the max possible feature size
+        dummy_height = dummy_width = 448
+        image_feature_size = _get_llava_next_image_feature_size(
+            hf_config, input_height=dummy_height, input_width=dummy_width)
 
         if isinstance(vision_config, CLIPVisionConfig):
-            return cls._dummy_data_for_clip(
-                model_config,
-                multimodal_config,
+            seq_data = cls._dummy_seq_data_for_clip(
                 vision_config,
-                seq_len=seq_len,
+                seq_len,
                 image_token_id=hf_config.image_token_index,
                 image_feature_size_override=image_feature_size,
             )
 
+            image_input_type = multimodal_config.image_input_type
+            ImageInputType = VisionLanguageConfig.ImageInputType
+            multi_modal_data: MultiModalData
+            if image_input_type == ImageInputType.PIXEL_VALUES:
+                multi_modal_data = cls._dummy_pixel_data_for_clip(
+                    vision_config,
+                    image_width_override=dummy_width,
+                    image_height_override=dummy_height,
+                )
+            elif image_input_type == ImageInputType.IMAGE_FEATURES:
+                multi_modal_data = cls._dummy_feature_data_for_clip(
+                    vision_config,
+                    image_feature_size_override=image_feature_size,
+                )
+
+            return seq_data, multi_modal_data
+
         msg = f"Unsupported vision config: {type(vision_config)}"
         raise NotImplementedError(msg)
 

From 9cfbcce3190dd3544853f13cf63392137a6289f0 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 13:05:58 +0000
Subject: [PATCH 017/181] Fix LLaVA-NeXT input processor and cleanup code

---
 vllm/multimodal/image.py | 217 ++++++++++++++++++++++++++-------------
 1 file changed, 145 insertions(+), 72 deletions(-)

diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index ca592c7ce878..0da0ae042519 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -5,7 +5,7 @@
 import torch
 from PIL import Image
 from transformers import (CLIPVisionConfig, LlavaConfig, LlavaNextConfig,
-                          PretrainedConfig)
+                          PretrainedConfig, PreTrainedTokenizerBase)
 from transformers.models.llava_next.modeling_llava_next import (
     get_anyres_image_grid_shape)
 
@@ -67,17 +67,20 @@ def _get_llava_next_num_unpadded_features(
     return (unpadded_features, newline_features)
 
 
-def _get_llava_next_image_feature_size(hf_config: LlavaNextConfig) -> int:
+def _get_llava_next_image_feature_size(
+    hf_config: LlavaNextConfig,
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
         num_patches = _get_clip_num_patches(vision_config)
         base_feature_size = num_patches * num_patches
 
-        # Results in the max possible feature size
-        dummy_height, dummy_width = 448, 448
         num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-            image_size=(dummy_height, dummy_width),
+            image_size=(input_height, input_width),
             grid_pinpoints=hf_config.image_grid_pinpoints,
             patch_size=vision_config.image_size,
         )
@@ -85,7 +88,7 @@ def _get_llava_next_image_feature_size(hf_config: LlavaNextConfig) -> int:
         (
             unpadded_feature_size,
             newline_feature_size,
-        ) = _get_llava_next_num_unpadded_features(dummy_height, dummy_width,
+        ) = _get_llava_next_num_unpadded_features(input_height, input_width,
                                                   num_patches,
                                                   num_patch_height,
                                                   num_patch_width)
@@ -105,10 +108,8 @@ class DummyImageDataFactories:
     """
 
     @classmethod
-    def _dummy_data_for_clip(
+    def _dummy_seq_data_for_clip(
         cls,
-        model_config: ModelConfig,
-        multimodal_config: VisionLanguageConfig,
         hf_config: CLIPVisionConfig,
         seq_len: int,
         *,
@@ -122,22 +123,40 @@ def _dummy_data_for_clip(
 
         token_ids = [image_token_id] * image_feature_size
         token_ids += [0] * (seq_len - image_feature_size)
-        seq_data = SequenceData(token_ids)
-
-        image_input_type = multimodal_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-        multi_modal_data: MultiModalData
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            width = height = hf_config.image_size
-            image = Image.new("RGB", (width, height), color=0)
-            multi_modal_data = ImagePixelData(image)
-        elif image_input_type == ImageInputType.IMAGE_FEATURES:
-            depth = hf_config.hidden_size
-            values = torch.zeros((1, image_feature_size, depth),
-                                 dtype=torch.float16)
-            multi_modal_data = ImageFeatureData(values)
-
-        return seq_data, multi_modal_data
+        return SequenceData(token_ids)
+
+    @classmethod
+    def _dummy_pixel_data_for_clip(
+        cls,
+        hf_config: CLIPVisionConfig,
+        *,
+        image_width_override: Optional[int] = None,
+        image_height_override: Optional[int] = None,
+    ):
+        width = height = hf_config.image_size
+        if image_width_override is not None:
+            width = image_width_override
+        if image_height_override is not None:
+            height = image_height_override
+
+        image = Image.new("RGB", (width, height), color=0)
+        return ImagePixelData(image)
+
+    @classmethod
+    def _dummy_feature_data_for_clip(
+        cls,
+        hf_config: CLIPVisionConfig,
+        *,
+        image_feature_size_override: Optional[int] = None,
+    ):
+        if image_feature_size_override is None:
+            image_feature_size = _get_clip_image_feature_size(hf_config)
+        else:
+            image_feature_size = image_feature_size_override
+
+        values = torch.zeros((1, image_feature_size, hf_config.hidden_size),
+                             dtype=torch.float16)
+        return ImageFeatureData(values)
 
     @classmethod
     def _dummy_data_for_llava(
@@ -150,14 +169,24 @@ def _dummy_data_for_llava(
         vision_config = hf_config.vision_config
 
         if isinstance(vision_config, CLIPVisionConfig):
-            return cls._dummy_data_for_clip(
-                model_config,
-                multimodal_config,
+            seq_data = cls._dummy_seq_data_for_clip(
                 vision_config,
-                seq_len=seq_len,
+                seq_len,
                 image_token_id=hf_config.image_token_index,
             )
 
+            image_input_type = multimodal_config.image_input_type
+            ImageInputType = VisionLanguageConfig.ImageInputType
+            multi_modal_data: MultiModalData
+            if image_input_type == ImageInputType.PIXEL_VALUES:
+                multi_modal_data = cls._dummy_pixel_data_for_clip(
+                    vision_config)
+            elif image_input_type == ImageInputType.IMAGE_FEATURES:
+                multi_modal_data = cls._dummy_feature_data_for_clip(
+                    vision_config)
+
+            return seq_data, multi_modal_data
+
         msg = f"Unsupported vision config: {type(vision_config)}"
         raise NotImplementedError(msg)
 
@@ -170,18 +199,37 @@ def _dummy_data_for_llava_next(
         seq_len: int,
     ):
         vision_config = hf_config.vision_config
-        image_feature_size = _get_llava_next_image_feature_size(hf_config)
+
+        # Result in the max possible feature size
+        dummy_height = dummy_width = 448
+        image_feature_size = _get_llava_next_image_feature_size(
+            hf_config, input_height=dummy_height, input_width=dummy_width)
 
         if isinstance(vision_config, CLIPVisionConfig):
-            return cls._dummy_data_for_clip(
-                model_config,
-                multimodal_config,
+            seq_data = cls._dummy_seq_data_for_clip(
                 vision_config,
-                seq_len=seq_len,
+                seq_len,
                 image_token_id=hf_config.image_token_index,
                 image_feature_size_override=image_feature_size,
             )
 
+            image_input_type = multimodal_config.image_input_type
+            ImageInputType = VisionLanguageConfig.ImageInputType
+            multi_modal_data: MultiModalData
+            if image_input_type == ImageInputType.PIXEL_VALUES:
+                multi_modal_data = cls._dummy_pixel_data_for_clip(
+                    vision_config,
+                    image_width_override=dummy_width,
+                    image_height_override=dummy_height,
+                )
+            elif image_input_type == ImageInputType.IMAGE_FEATURES:
+                multi_modal_data = cls._dummy_feature_data_for_clip(
+                    vision_config,
+                    image_feature_size_override=image_feature_size,
+                )
+
+            return seq_data, multi_modal_data
+
         msg = f"Unsupported vision config: {type(vision_config)}"
         raise NotImplementedError(msg)
 
@@ -244,52 +292,45 @@ def _repeat_and_pad_token(
     @classmethod
     def _repeat_and_pad_image_tokens(
         cls,
-        model_config: ModelConfig,
-        llm_inputs: LLMInputs,
+        tokenizer: PreTrainedTokenizerBase,
+        prompt: Optional[str],
+        prompt_token_ids: List[int],
         *,
         image_token_id: int,
         repeat_count: int = 1,
         pad_token_left: Optional[int] = None,
         pad_token_right: Optional[int] = None,
-    ) -> LLMInputs:
-        multi_modal_data = llm_inputs.get("multi_modal_data")
-        if multi_modal_data is None:
-            return llm_inputs
-
-        tokenizer = _cached_get_tokenizer(model_config.tokenizer)
-        image_token_str = tokenizer.decode(image_token_id)
-        pad_token_str_left = (None if pad_token_left is None else
-                              tokenizer.decode(pad_token_left))
-        pad_token_str_right = (None if pad_token_right is None else
-                               tokenizer.decode(pad_token_right))
-
-        replacement_str = "".join(
-            cls._repeat_and_pad_token(
-                image_token_str,
-                repeat_count=repeat_count,
-                pad_token_left=pad_token_str_left,
-                pad_token_right=pad_token_str_right,
-            ))
-        replacement_ids = cls._repeat_and_pad_token(
-            image_token_id,
-            repeat_count=repeat_count,
-            pad_token_left=pad_token_left,
-            pad_token_right=pad_token_right,
-        )
-
+    ) -> Tuple[Optional[str], List[int]]:
         # To avoid invoking the tokenizer, we assume that the
         # image token is called "<image>"
-        prompt = llm_inputs.get("prompt")
         if prompt is None:
             new_prompt = None
         else:
+            image_token_str = tokenizer.decode(image_token_id)
+            pad_token_str_left = (None if pad_token_left is None else
+                                  tokenizer.decode(pad_token_left))
+            pad_token_str_right = (None if pad_token_right is None else
+                                   tokenizer.decode(pad_token_right))
+            replacement_str = "".join(
+                cls._repeat_and_pad_token(
+                    image_token_str,
+                    repeat_count=repeat_count,
+                    pad_token_left=pad_token_str_left,
+                    pad_token_right=pad_token_str_right,
+                ))
+
             # The image tokens are removed to be consistent with HuggingFace
             new_prompt = prompt.replace(image_token_str, replacement_str, 1)
 
-        prompt_token_ids = llm_inputs["prompt_token_ids"]
         new_token_ids: List[int] = []
         for i, token in enumerate(prompt_token_ids):
             if token == image_token_id:
+                replacement_ids = cls._repeat_and_pad_token(
+                    image_token_id,
+                    repeat_count=repeat_count,
+                    pad_token_left=pad_token_left,
+                    pad_token_right=pad_token_right,
+                )
                 new_token_ids.extend(replacement_ids)
 
                 # No need to further scan the list since we only replace once
@@ -298,10 +339,7 @@ def _repeat_and_pad_image_tokens(
             else:
                 new_token_ids.append(token)
 
-        # NOTE: Create a defensive copy of the original inputs
-        return LLMInputs(prompt_token_ids=new_token_ids,
-                         prompt=new_prompt,
-                         multi_modal_data=multi_modal_data)
+        return new_prompt, new_token_ids
 
     @classmethod
     def _input_processor_for_clip(
@@ -314,18 +352,31 @@ def _input_processor_for_clip(
         image_token_id: int,
         image_feature_size_override: Optional[int] = None,
     ):
+        multi_modal_data = llm_inputs.get("multi_modal_data")
+        if multi_modal_data is None or not isinstance(
+                multi_modal_data, (ImagePixelData, ImageFeatureData)):
+            return llm_inputs
+
+        tokenizer = _cached_get_tokenizer(model_config.tokenizer)
+
         if image_feature_size_override is None:
             image_feature_size = _get_clip_image_feature_size(hf_config)
         else:
             image_feature_size = image_feature_size_override
 
-        return cls._repeat_and_pad_image_tokens(
-            model_config,
-            llm_inputs,
+        new_prompt, new_token_ids = cls._repeat_and_pad_image_tokens(
+            tokenizer,
+            llm_inputs.get("prompt"),
+            llm_inputs["prompt_token_ids"],
             image_token_id=image_token_id,
             repeat_count=image_feature_size,
         )
 
+        # NOTE: Create a defensive copy of the original inputs
+        return LLMInputs(prompt_token_ids=new_token_ids,
+                         prompt=new_prompt,
+                         multi_modal_data=multi_modal_data)
+
     @classmethod
     def _input_processor_for_llava(
         cls,
@@ -334,6 +385,11 @@ def _input_processor_for_llava(
         hf_config: LlavaConfig,
         llm_inputs: LLMInputs,
     ):
+        multi_modal_data = llm_inputs.get("multi_modal_data")
+        if multi_modal_data is None or not isinstance(
+                multi_modal_data, (ImagePixelData, ImageFeatureData)):
+            return llm_inputs
+
         vision_config = hf_config.vision_config
 
         if isinstance(vision_config, CLIPVisionConfig):
@@ -356,8 +412,25 @@ def _input_processor_for_llava_next(
         hf_config: LlavaNextConfig,
         llm_inputs: LLMInputs,
     ):
+        multi_modal_data = llm_inputs.get("multi_modal_data")
+        if multi_modal_data is None or not isinstance(
+                multi_modal_data, (ImagePixelData, ImageFeatureData)):
+            return llm_inputs
+
+        if isinstance(multi_modal_data, ImagePixelData):
+            image = multi_modal_data.image
+            if isinstance(image, torch.Tensor):
+                _, _, _, height, width = image.shape
+            else:
+                width, height = image.size
+
+            image_feature_size = _get_llava_next_image_feature_size(
+                hf_config, input_height=height, input_width=width)
+        else:
+            image_features = multi_modal_data.image_features
+            image_feature_size = image_features.shape[-2]
+
         vision_config = hf_config.vision_config
-        image_feature_size = _get_llava_next_image_feature_size(hf_config)
 
         if isinstance(vision_config, CLIPVisionConfig):
             return cls._input_processor_for_clip(

From 7bb6cbf81cc35a566f039566b7eb93457df549d3 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 6 Jun 2024 03:17:24 +0000
Subject: [PATCH 018/181] Add sanity check

---
 vllm/worker/model_runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 54cba07344c7..cbfba4a89beb 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -797,6 +797,7 @@ def profile_run(self) -> None:
 
             seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
                 .dummy_data_for_profiling(model_config, seq_len)
+            assert len(seq_data.prompt_token_ids) == seq_len
 
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),

From 8e2ff860e58c289f468c05afcdec6333e1ab11a4 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 11 Jun 2024 07:02:54 +0000
Subject: [PATCH 019/181] Update LLaVA-NeXT

---
 vllm/model_executor/models/llava_next.py | 50 +++++++-----------------
 1 file changed, 15 insertions(+), 35 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 57cbd1e4a601..7039762137fb 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -3,7 +3,6 @@
 
 import torch
 import torch.nn as nn
-from PIL import Image
 # TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on
 # transformers' impl.
 from transformers import CLIPVisionModel, LlavaNextConfig
@@ -12,7 +11,9 @@
 from typing_extensions import NotRequired
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.inputs import INPUT_REGISTRY
+from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -22,9 +23,8 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
-from vllm.sequence import SamplerOutput, SequenceData
+from vllm.multimodal.image import DummyImageDataFactories, ImagePixelData
+from vllm.sequence import SamplerOutput
 
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
 from .vlm_base import VisionLanguageModelBase
@@ -59,41 +59,19 @@ class LlavaNextImageFeatureInputs(TypedDict):
                              LlavaNextImageFeatureInputs]
 
 
-def _get_dummy_image_data(
-    seq_len: int,
-    model_config: ModelConfig,
-    vlm_config: VisionLanguageConfig,
-) -> Tuple[SequenceData, MultiModalData]:
-    seq_data, fake_mm_data = get_dummy_image_data(seq_len, model_config,
-                                                  vlm_config)
-
-    config_input_type = vlm_config.image_input_type
-    ImageInputType = VisionLanguageConfig.ImageInputType
-
-    if config_input_type == ImageInputType.PIXEL_VALUES:
-        _, c, h, w = vlm_config.image_input_shape
-        mode = {1: "L", 3: "RGB"}[c]
-        fake_mm_data = ImagePixelData(Image.new(mode, (w, h), color=0))
-
-    return seq_data, fake_mm_data
-
-
-def _image_pixel_processor(
-    data: ImagePixelData,
-    model_config: ModelConfig,
-    vlm_config: VisionLanguageConfig,
-) -> Dict[str, torch.Tensor]:
+def _image_pixel_processor(ctx: InputContext,
+                           data: ImagePixelData) -> Dict[str, torch.Tensor]:
     image = data.image
 
     if isinstance(image, torch.Tensor):
-        pixel_values = image.to(model_config.dtype)
+        pixel_values = image.to(ctx.model_config.dtype)
         batch_size, _, _, h, w = pixel_values.shape
         image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
 
         return {"pixel_values": pixel_values, "image_sizes": image_sizes}
 
     # Temporary patch before dynamic number of image tokens is supported
-    _, _, h, w = vlm_config.image_input_shape
+    _, _, h, w = ctx.get_multimodal_config().image_input_shape
     if (w, h) != (image.width, image.height):
         logger.warning(
             "Dynamic image shape is currently not supported. "
@@ -101,12 +79,14 @@ def _image_pixel_processor(
 
         data.image = image.resize((w, h))
 
-    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-        ._default_input_processor(data, model_config, vlm_config)
+    return INPUT_REGISTRY.MULTIMODAL._get_plugin_for_data_type(ImagePixelData) \
+        ._default_input_mapper(ctx, data)
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
-@MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
+@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
+@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper(_image_pixel_processor)
+@INPUT_REGISTRY.register_dummy_data(
+    DummyImageDataFactories.for_model(LlavaNextConfig))
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
     """
     Args to `forward()`:

From b134dfc84443d3d27cefa524f5fd47c5262ae0a7 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 11 Jun 2024 07:08:35 +0000
Subject: [PATCH 020/181] Update name

---
 vllm/model_executor/models/llava_next.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 7039762137fb..04442d17d7c0 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -59,8 +59,8 @@ class LlavaNextImageFeatureInputs(TypedDict):
                              LlavaNextImageFeatureInputs]
 
 
-def _image_pixel_processor(ctx: InputContext,
-                           data: ImagePixelData) -> Dict[str, torch.Tensor]:
+def _pixel_mapper(ctx: InputContext,
+                  data: ImagePixelData) -> Dict[str, torch.Tensor]:
     image = data.image
 
     if isinstance(image, torch.Tensor):
@@ -84,7 +84,7 @@ def _image_pixel_processor(ctx: InputContext,
 
 
 @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
-@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper(_image_pixel_processor)
+@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper(_pixel_mapper)
 @INPUT_REGISTRY.register_dummy_data(
     DummyImageDataFactories.for_model(LlavaNextConfig))
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):

From 1a08444cb38c4dc6857dfd0b0b9ee62047a1ce59 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 11 Jun 2024 07:13:20 +0000
Subject: [PATCH 021/181] Update LLaVA-NeXT

---
 vllm/model_executor/models/llava_next.py | 34 ++++--------------------
 vllm/multimodal/image.py                 |  2 ++
 2 files changed, 7 insertions(+), 29 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 04442d17d7c0..c1ffd6082d36 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,5 +1,4 @@
-from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict,
-                    Union)
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
 
 import torch
 import torch.nn as nn
@@ -13,7 +12,6 @@
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
 from vllm.inputs import INPUT_REGISTRY
-from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -23,7 +21,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal.image import DummyImageDataFactories, ImagePixelData
+from vllm.multimodal.image import DummyImageDataFactories, ImageInputProcessors
 from vllm.sequence import SamplerOutput
 
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
@@ -59,34 +57,12 @@ class LlavaNextImageFeatureInputs(TypedDict):
                              LlavaNextImageFeatureInputs]
 
 
-def _pixel_mapper(ctx: InputContext,
-                  data: ImagePixelData) -> Dict[str, torch.Tensor]:
-    image = data.image
-
-    if isinstance(image, torch.Tensor):
-        pixel_values = image.to(ctx.model_config.dtype)
-        batch_size, _, _, h, w = pixel_values.shape
-        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
-
-        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
-
-    # Temporary patch before dynamic number of image tokens is supported
-    _, _, h, w = ctx.get_multimodal_config().image_input_shape
-    if (w, h) != (image.width, image.height):
-        logger.warning(
-            "Dynamic image shape is currently not supported. "
-            "Resizing input image to (%d, %d).", w, h)
-
-        data.image = image.resize((w, h))
-
-    return INPUT_REGISTRY.MULTIMODAL._get_plugin_for_data_type(ImagePixelData) \
-        ._default_input_mapper(ctx, data)
-
-
 @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
-@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper(_pixel_mapper)
+@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(
     DummyImageDataFactories.for_model(LlavaNextConfig))
+@INPUT_REGISTRY.register_input_processor(
+    ImageInputProcessors.for_model(LlavaNextConfig))
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
     """
     Args to `forward()`:
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index c6a9da0d9e45..f7713cfb7d43 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -14,6 +14,7 @@
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
+from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from .base import MultiModalData, MultiModalPlugin
 
@@ -25,6 +26,7 @@
 logger = init_logger(__name__)
 
 _cached_get_image_processor = lru_cache(get_image_processor)
+_cached_get_tokenizer = lru_cache(get_tokenizer)
 
 
 def _get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:

From 3fb622c548fac8f1d73d99010cce3f3634d3dccd Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 11 Jun 2024 15:04:05 +0000
Subject: [PATCH 022/181] Remove `MULTIMODAL` convenience property as it was
 causing some (import-related?) problems with ray workers

---
 vllm/inputs/__init__.py                  |  6 +-----
 vllm/inputs/registry.py                  | 11 ++---------
 vllm/model_executor/models/llava.py      |  5 +++--
 vllm/model_executor/models/llava_next.py |  7 ++++---
 vllm/worker/cpu_model_runner.py          |  4 ++--
 vllm/worker/model_runner.py              |  3 ++-
 6 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 6288503bfe19..16206f022b59 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,15 +1,11 @@
-from vllm.multimodal import MULTIMODAL_REGISTRY
-
 from .data import (LLMInputs, ParsedText, ParsedTokens, PromptInputs,
                    PromptStrictInputs, TextPrompt, TextTokensPrompt,
                    TokensPrompt, parse_and_batch_prompt)
 from .registry import InputRegistry
 
-INPUT_REGISTRY = InputRegistry(multimodal_registry=MULTIMODAL_REGISTRY)
+INPUT_REGISTRY = InputRegistry()
 """The global :class:`~InputRegistry` which is used by model runners."""
 
-del MULTIMODAL_REGISTRY
-
 __all__ = [
     "ParsedText", "ParsedTokens", "parse_and_batch_prompt", "TextPrompt",
     "TokensPrompt", "TextTokensPrompt", "PromptStrictInputs", "PromptInputs",
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 3bdfa1122b3a..0a90b911a3c2 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -12,7 +12,7 @@
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VisionLanguageConfig
-    from vllm.multimodal import MultiModalData, MultiModalRegistry
+    from vllm.multimodal import MultiModalData
     from vllm.sequence import SequenceData
 
 logger = init_logger(__name__)
@@ -62,19 +62,12 @@ class InputRegistry:
     according to its modality and the target model.
     """
 
-    def __init__(self, *, multimodal_registry: "MultiModalRegistry") -> None:
-        self._multimodal_registry = multimodal_registry
-
+    def __init__(self) -> None:
         self._dummy_factories_by_model_type: Dict[Type[nn.Module],
                                                   DummyDataFactory] = {}
         self._input_processors_by_model_type: Dict[Type[nn.Module],
                                                    InputProcessor] = {}
 
-    @property
-    def MULTIMODAL(self) -> "MultiModalRegistry":
-        """Access the registry for processing multimodal inputs."""
-        return self._multimodal_registry
-
     def _default_dummy_data_factory(
         self,
         ctx: InputContext,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index fd8e67e17576..8a576c801d84 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -18,6 +18,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import DummyImageDataFactories
 from vllm.sequence import SamplerOutput
 
@@ -84,8 +85,8 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
 
-@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
-@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
+@MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(
     DummyImageDataFactories.for_model(LlavaConfig))
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 04442d17d7c0..17ace179124b 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -23,6 +23,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import DummyImageDataFactories, ImagePixelData
 from vllm.sequence import SamplerOutput
 
@@ -79,12 +80,12 @@ def _pixel_mapper(ctx: InputContext,
 
         data.image = image.resize((w, h))
 
-    return INPUT_REGISTRY.MULTIMODAL._get_plugin_for_data_type(ImagePixelData) \
+    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
         ._default_input_mapper(ctx, data)
 
 
-@INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
-@INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper(_pixel_mapper)
+@MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_pixel_mapper)
 @INPUT_REGISTRY.register_dummy_data(
     DummyImageDataFactories.for_model(LlavaNextConfig))
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 95d8e44f5111..5b02d1a83758 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -9,10 +9,10 @@
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict
-from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
 
@@ -66,7 +66,7 @@ def __init__(
         )
 
         # Create processor for multi-modal data
-        self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
             .create_input_mapper(self.model_config)
 
         # Lazy initialization.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 73e03e0022ef..ee531b500c95 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -21,6 +21,7 @@
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
@@ -126,7 +127,7 @@ def __init__(
         )
 
         # Create processor for multi-modal data
-        self.multi_modal_input_mapper = INPUT_REGISTRY.MULTIMODAL \
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
             .create_input_mapper(self.model_config)
 
         # Lazy initialization

From 383bea194fe835906615a1146bd9c1dc0313b80a Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 11 Jun 2024 15:07:37 +0000
Subject: [PATCH 023/181] Update docs

---
 vllm/inputs/__init__.py     | 5 ++++-
 vllm/inputs/registry.py     | 4 ++--
 vllm/multimodal/__init__.py | 5 ++++-
 vllm/multimodal/registry.py | 2 +-
 4 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 16206f022b59..d251d1ac72f8 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -4,7 +4,10 @@
 from .registry import InputRegistry
 
 INPUT_REGISTRY = InputRegistry()
-"""The global :class:`~InputRegistry` which is used by model runners."""
+"""
+The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
+to dispatch data processing according to the target model.
+"""
 
 __all__ = [
     "ParsedText", "ParsedTokens", "parse_and_batch_prompt", "TextPrompt",
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 7c14916bd803..be3be81cba72 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -58,8 +58,8 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C:
 
 class InputRegistry:
     """
-    This registry is used by :class:`~vllm.LLMEngine` to dispatch data
-    processing according to the target model.
+    A registry to dispatch data processing
+    according to the target model.
     """
 
     def __init__(self) -> None:
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index c97586258c90..dfd47f476d37 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -2,7 +2,10 @@
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
-"""The global :class:`~MultiModalRegistry` which is used by model runners."""
+"""
+The global :class:`~MultiModalRegistry` is used by model runners to
+dispatch data processing according to its modality and the target model.
+"""
 
 __all__ = [
     "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 758bf43ca8fd..abc88e4f9a9d 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -18,7 +18,7 @@
 
 class MultiModalRegistry:
     """
-    This registry is used by model runners to dispatch data processing
+    A registry to dispatch data processing
     according to its modality and the target model.
     """
 

From 80a09f2bb915e64b5758578250eafd3e013c8044 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 12 Jun 2024 01:04:31 +0000
Subject: [PATCH 024/181] Remove double processing of image tokens

---
 vllm/config.py                            | 12 +------
 vllm/entrypoints/openai/serving_chat.py   | 39 +++--------------------
 vllm/entrypoints/openai/serving_engine.py |  1 +
 vllm/multimodal/utils.py                  | 16 ----------
 4 files changed, 6 insertions(+), 62 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index e77bddea9e72..a20a31ea3214 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -5,7 +5,7 @@
                     Union)
 
 import torch
-from transformers import PretrainedConfig, PreTrainedTokenizerBase
+from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -1118,16 +1118,6 @@ def get_image_input_enum_type(cls, value: str) -> ImageInputType:
                              f"Expecting to choose from "
                              f"{[x.name for x in cls.ImageInputType]}.") from e
 
-    #TODO(ywang96): make this a cached property once we refactor the
-    # VisionLanguageConfig class.
-    def get_image_token_text(
-            self, tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]:
-        """Get the image token placeholder text to be inserted into the 
-        text prompt and the string representation of the image token id.
-        """
-        image_token_str = tokenizer.decode(self.image_token_id)
-        return image_token_str * self.image_feature_size, image_token_str
-
     def as_cli_args_dict(self) -> Dict[str, Any]:
         """Flatten vision language config to pure args.
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 7cd434fe0d27..d6d690d7e61a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -10,7 +10,7 @@
 from openai.types.chat import (ChatCompletionContentPartImageParam,
                                ChatCompletionContentPartTextParam)
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionContentPartParam, ChatCompletionLogProb,
@@ -27,8 +27,7 @@
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
 from vllm.multimodal.image import ImagePixelData
-from vllm.multimodal.utils import (async_get_and_parse_image,
-                                   get_full_image_text_prompt)
+from vllm.multimodal.utils import async_get_and_parse_image
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
 from vllm.utils import random_uuid
@@ -103,10 +102,6 @@ def _parse_chat_message_content_parts(
         texts: List[str] = []
         image_futures: List[Awaitable[ImagePixelData]] = []
 
-        vlm_config: Optional[VisionLanguageConfig] = getattr(
-            self.engine.engine, "vision_language_config", None)
-        model_config = getattr(self.engine.engine, "model_config", None)
-
         for part in parts:
             part_type = part["type"]
             if part_type == "text":
@@ -114,7 +109,7 @@ def _parse_chat_message_content_parts(
 
                 texts.append(text)
             elif part_type == "image_url":
-                if vlm_config is None:
+                if self.model_config.multimodal_config is None:
                     raise ValueError(
                         "'image_url' input is not supported as the loaded "
                         "model is not multimodal.")
@@ -141,33 +136,7 @@ def _parse_chat_message_content_parts(
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
         text_prompt = "\n".join(texts)
-
-        if vlm_config is not None and len(image_futures):
-
-            (image_token_prompt,
-             image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
-
-            # NOTE: If image token string (e.g, <image>) is already present
-            # in the text prompt, we assume it follows the same format required
-            # by the engine.
-            if image_token_str in text_prompt:
-                logger.warning(
-                    "Detected image token string in the text prompt. "
-                    "Skipping prompt formatting.")
-                messages = [
-                    ConversationMessage(role=role, content=text_prompt)
-                ]
-
-            else:
-                full_prompt = get_full_image_text_prompt(
-                    image_prompt=image_token_prompt,
-                    text_prompt=text_prompt,
-                    config=model_config)
-                messages = [
-                    ConversationMessage(role=role, content=full_prompt)
-                ]
-        else:
-            messages = [ConversationMessage(role=role, content=text_prompt)]
+        messages = [ConversationMessage(role=role, content=text_prompt)]
 
         return ChatMessageParseResult(messages=messages,
                                       image_futures=image_futures)
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 6b5a62efc7f2..99d64f6146f8 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -35,6 +35,7 @@ def __init__(self, engine: AsyncLLMEngine, model_config: ModelConfig,
         super().__init__()
 
         self.engine = engine
+        self.model_config = model_config
         self.max_model_len = model_config.max_model_len
 
         # A separate tokenizer to map token IDs to strings.
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index c6311d60e0bd..55157e4b377f 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -5,7 +5,6 @@
 import aiohttp
 from PIL import Image
 
-from vllm.config import ModelConfig
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
 from vllm.multimodal.image import ImagePixelData
 
@@ -68,18 +67,3 @@ def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
 def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     """Load image from base64 format."""
     return Image.open(BytesIO(base64.b64decode(image)))
-
-
-# TODO(ywang96): move this to a model registry for preprocessing vision
-# language prompts based on the model type.
-def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
-                               config: ModelConfig) -> str:
-    """Combine image and text prompts for vision language model depending on
-    the model architecture."""
-
-    if config.hf_config.model_type in ("llava", "llava_next"):
-        full_prompt = f"{image_prompt}\n{text_prompt}"
-    else:
-        raise ValueError(
-            f"Unsupported model type: {config.hf_config.model_type}")
-    return full_prompt

From 6a70e4f55497ac009754edb5a28801654628051f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 12 Jun 2024 02:12:33 +0000
Subject: [PATCH 025/181] Add docs

---
 .../input_processing/model_inputs_index.rst   | 30 +++++++++++++++++++
 docs/source/index.rst                         |  1 +
 docs/source/models/adding_model.rst           |  4 +--
 vllm/inputs/__init__.py                       |  5 +++-
 vllm/inputs/data.py                           |  5 ++++
 vllm/inputs/registry.py                       | 26 ++++++++++++++--
 vllm/multimodal/__init__.py                   |  5 +++-
 vllm/multimodal/base.py                       |  4 +--
 vllm/multimodal/image.py                      | 11 +++++--
 vllm/multimodal/registry.py                   |  2 +-
 10 files changed, 81 insertions(+), 12 deletions(-)
 create mode 100644 docs/source/dev/input_processing/model_inputs_index.rst

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
new file mode 100644
index 000000000000..f1f929c92332
--- /dev/null
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -0,0 +1,30 @@
+Input Processing
+================
+
+.. currentmodule:: vllm.inputs
+
+vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
+in :class:`~vllm.LLMEngine` before they are passed to model executors.
+
+.. contents::
+   :local:
+   :backlinks: none
+
+Module Contents
++++++++++++++++
+
+LLM Engine Inputs
+-----------------
+
+.. autoclass:: vllm.inputs.LLMInputs
+    :members:
+    :show-inheritance:
+
+Registry
+--------
+
+.. autodata:: vllm.inputs.INPUT_REGISTRY
+
+.. automodule:: vllm.inputs.registry
+    :members:
+    :show-inheritance:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 0ff0ea1da1ca..bf21a9e70335 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -109,6 +109,7 @@ Documentation
    dev/offline_inference/offline_index
    dev/engine/engine_index
    dev/kernel/paged_attention
+   dev/input_processing/model_inputs_index
    dev/multimodal/multimodal_index
    dev/dockerfile/dockerfile
 
diff --git a/docs/source/models/adding_model.rst b/docs/source/models/adding_model.rst
index cbc8099e6f70..f282b594590b 100644
--- a/docs/source/models/adding_model.rst
+++ b/docs/source/models/adding_model.rst
@@ -37,7 +37,7 @@ For instance, vLLM's `OPT model <https://github.com/vllm-project/vllm/blob/main/
 2. Rewrite the :code:`forward` methods
 --------------------------------------
 
-Next, you need to rewrite the :code:`forward` methods of your model by following these steps:
+Next, you need to rewrite the :meth:`~torch.nn.Module.forward` method of your model by following these steps:
 
 1. Remove any unnecessary code, such as the code only used for training.
 2. Change the input parameters:
@@ -75,7 +75,7 @@ Next, you need to rewrite the :code:`forward` methods of your model by following
 
 If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it.
 To do this, substitute your model's linear and embedding layers with their tensor-parallel versions.
-For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
+For the embedding layer, you can simply replace :class:`torch.nn.Embedding` with :code:`VocabParallelEmbedding`. For the output LM head, you can use :code:`ParallelLMHead`.
 When it comes to the linear layers, we provide the following options to parallelize them:
 
 * :code:`ReplicatedLinear`: Replicates the inputs and weights across multiple GPUs. No memory saving.
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 16206f022b59..d251d1ac72f8 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -4,7 +4,10 @@
 from .registry import InputRegistry
 
 INPUT_REGISTRY = InputRegistry()
-"""The global :class:`~InputRegistry` which is used by model runners."""
+"""
+The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
+to dispatch data processing according to the target model.
+"""
 
 __all__ = [
     "ParsedText", "ParsedTokens", "parse_and_batch_prompt", "TextPrompt",
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 85c9cd84f5ed..2c600e9793f3 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -125,6 +125,11 @@ class TextTokensPrompt(TypedDict):
 
 
 class LLMInputs(TypedDict):
+    """
+    The inputs in :class:`~vllm.LLMEngine` before they are
+    passed to the model executor.
+    """
+
     prompt_token_ids: List[int]
     prompt: NotRequired[Optional[str]]
     multi_modal_data: NotRequired[Optional["MultiModalData"]]
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 0a90b911a3c2..1abc51d1e308 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -22,9 +22,22 @@
 
 @dataclass(frozen=True)
 class InputContext:
+    """
+    Contains information about the model which may be used to
+    modify the inputs.
+    """
+
     model_config: "ModelConfig"
+    """The configuration of the model."""
 
     def get_multimodal_config(self) -> "VisionLanguageConfig":
+        """
+        Get the multimodal configuration of the model.
+
+        Raises:
+            ValueError: If the model is not multimodal.
+        """
+
         multimodal_config = self.model_config.multimodal_config
         if multimodal_config is None:
             raise ValueError("No multimodal config found")
@@ -32,6 +45,15 @@ def get_multimodal_config(self) -> "VisionLanguageConfig":
         return multimodal_config
 
     def get_hf_config(self, hf_config_type: Type[C]) -> C:
+        """
+        Get the HuggingFace configuration
+        (:class:`transformers.PretrainedConfig`) of the model,
+        additionally checking its type.
+
+        Raises:
+            ValueError: If the model is not of the specified type.
+        """
+
         hf_config = self.model_config.hf_config
         if not isinstance(hf_config, hf_config_type):
             raise TypeError("Invalid type of HuggingFace config. "
@@ -58,8 +80,8 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C:
 
 class InputRegistry:
     """
-    This registry is used by model runners to dispatch data processing
-    according to its modality and the target model.
+    A registry to dispatch data processing
+    according to the target model.
     """
 
     def __init__(self) -> None:
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index c97586258c90..dfd47f476d37 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -2,7 +2,10 @@
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
-"""The global :class:`~MultiModalRegistry` which is used by model runners."""
+"""
+The global :class:`~MultiModalRegistry` is used by model runners to
+dispatch data processing according to its modality and the target model.
+"""
 
 __all__ = [
     "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 94ee43d118d3..6b4684a54d33 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -35,7 +35,7 @@ class MultiModalData:
 
 MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
-:meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
+:meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
 
 
@@ -66,7 +66,7 @@ def get_data_type(self) -> Type[D]:
     def _default_input_mapper(self, ctx: InputContext,
                               data: D) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
-        :meth:`torch.nn.Module.forward`. This is similar in concept to
+        :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
         """
         raise NotImplementedError
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 022c40656b73..c451d89ee952 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -92,7 +92,12 @@ def _get_llava_next_image_feature_size(
 
 
 class DummyImageDataFactories:
-    """Contains factories for dummy image data factories."""
+    """
+    Contains factories for dummy image data factories.
+
+    See Also:
+        :data:`vllm.inputs.registry.DummyDataFactory`
+    """
 
     @classmethod
     def _dummy_seq_data_for_clip(
@@ -252,9 +257,9 @@ class ImagePixelData(MultiModalData):
     """
     The pixel data of an image. Can be one of:
 
-    - :class:``PIL.Image``: An image object. Requires that a HuggingFace
+    - :class:`PIL.Image.Image`: An image object. Requires that a HuggingFace
       processor is available to the model.
-    - :class:``torch.Tensor``: The raw pixel data which is passed to the model
+    - :class:`torch.Tensor`: The raw pixel data which is passed to the model
       without additional pre-processing.
     """
 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 758bf43ca8fd..abc88e4f9a9d 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -18,7 +18,7 @@
 
 class MultiModalRegistry:
     """
-    This registry is used by model runners to dispatch data processing
+    A registry to dispatch data processing
     according to its modality and the target model.
     """
 

From 8322ecb08cb5193687dd140d02b0fcd59e054444 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 12 Jun 2024 02:13:34 +0000
Subject: [PATCH 026/181] Add docs

---
 .../input_processing/model_inputs_index.rst   |  2 ++
 vllm/inputs/registry.py                       | 22 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
index 8529fc1d9af4..f1f929c92332 100644
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -23,6 +23,8 @@ LLM Engine Inputs
 Registry
 --------
 
+.. autodata:: vllm.inputs.INPUT_REGISTRY
+
 .. automodule:: vllm.inputs.registry
     :members:
     :show-inheritance:
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index be3be81cba72..1abc51d1e308 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -22,9 +22,22 @@
 
 @dataclass(frozen=True)
 class InputContext:
+    """
+    Contains information about the model which may be used to
+    modify the inputs.
+    """
+
     model_config: "ModelConfig"
+    """The configuration of the model."""
 
     def get_multimodal_config(self) -> "VisionLanguageConfig":
+        """
+        Get the multimodal configuration of the model.
+
+        Raises:
+            ValueError: If the model is not multimodal.
+        """
+
         multimodal_config = self.model_config.multimodal_config
         if multimodal_config is None:
             raise ValueError("No multimodal config found")
@@ -32,6 +45,15 @@ def get_multimodal_config(self) -> "VisionLanguageConfig":
         return multimodal_config
 
     def get_hf_config(self, hf_config_type: Type[C]) -> C:
+        """
+        Get the HuggingFace configuration
+        (:class:`transformers.PretrainedConfig`) of the model,
+        additionally checking its type.
+
+        Raises:
+            ValueError: If the model is not of the specified type.
+        """
+
         hf_config = self.model_config.hf_config
         if not isinstance(hf_config, hf_config_type):
             raise TypeError("Invalid type of HuggingFace config. "

From 52a01169407ae7185bd901a1c6358aa2ab00a346 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 12 Jun 2024 02:22:48 +0000
Subject: [PATCH 027/181] Add docs

---
 vllm/inputs/data.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 2c600e9793f3..8cb5e7ec5d6a 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -101,8 +101,7 @@ class TextTokensPrompt(TypedDict):
     """The prompt text."""
 
     prompt_token_ids: List[int]
-    """The token IDs of the prompt. If None, we use the
-    tokenizer to convert the prompts to token IDs."""
+    """The token IDs of the prompt."""
 
     multi_modal_data: NotRequired["MultiModalData"]
     """
@@ -131,5 +130,15 @@ class LLMInputs(TypedDict):
     """
 
     prompt_token_ids: List[int]
+    """The token IDs of the prompt."""
+
     prompt: NotRequired[Optional[str]]
+    """
+    The original prompt text corresponding to the token IDs, if available.
+    """
+
     multi_modal_data: NotRequired[Optional["MultiModalData"]]
+    """
+    Optional multi-modal data to pass to the model,
+    if the model supports it.
+    """

From c1733dd555a57975e68d2c076d44e31eb6de1ef5 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 12 Jun 2024 02:22:48 +0000
Subject: [PATCH 028/181] Add docs

---
 vllm/inputs/data.py | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 2c600e9793f3..8cb5e7ec5d6a 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -101,8 +101,7 @@ class TextTokensPrompt(TypedDict):
     """The prompt text."""
 
     prompt_token_ids: List[int]
-    """The token IDs of the prompt. If None, we use the
-    tokenizer to convert the prompts to token IDs."""
+    """The token IDs of the prompt."""
 
     multi_modal_data: NotRequired["MultiModalData"]
     """
@@ -131,5 +130,15 @@ class LLMInputs(TypedDict):
     """
 
     prompt_token_ids: List[int]
+    """The token IDs of the prompt."""
+
     prompt: NotRequired[Optional[str]]
+    """
+    The original prompt text corresponding to the token IDs, if available.
+    """
+
     multi_modal_data: NotRequired[Optional["MultiModalData"]]
+    """
+    Optional multi-modal data to pass to the model,
+    if the model supports it.
+    """

From 9fb5e7256f7236768964faa539059cc49a91f894 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 13 Jun 2024 09:33:47 +0000
Subject: [PATCH 029/181] Remove more instances of double processing; update
 docs

---
 docs/source/models/vlm.rst      | 6 ++----
 examples/llava_example.py       | 6 ++----
 tests/models/test_llava.py      | 7 +------
 tests/models/test_llava_next.py | 7 +------
 vllm/multimodal/image.py        | 2 --
 5 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 5ab4157cb358..96acd7142052 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -20,7 +20,6 @@ The following :ref:`engine arguments <engine_args>` are specific to VLMs:
     Currently, the support for vision language models on vLLM has the following limitations:
 
     * Only single image input is supported per text prompt.
-    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means model output might not exactly match the HuggingFace implementation.
 
     We are continuously improving user & developer experience for VLMs. Please raise an issue on GitHub if you have any feedback or feature requests.
 
@@ -41,13 +40,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
-* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
+* ``prompt``: The prompt is expected to have a single ``<image>`` token per image.
 * ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
 
 .. code-block:: python
 
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
+    prompt = "<image>\nUSER: What is the content of this image?\nASSISTANT:"
 
     # Load the image using PIL.Image
     image = ...
diff --git a/examples/llava_example.py b/examples/llava_example.py
index 980d7bf9f8a3..7d6433c91b19 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -22,8 +22,7 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
         disable_image_processor=disable_image_processor,
     )
 
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
+    prompt = "<image>\nUSER: What is the content of this image?\nASSISTANT:"
 
     if disable_image_processor:
         image = torch.load("images/stop_sign_pixel_values.pt")
@@ -49,8 +48,7 @@ def run_llava_image_features():
         image_feature_size=576,
     )
 
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
+    prompt = "<image>\nUSER: What is the content of this image?\nASSISTANT:"
 
     image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
 
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index a1f0cff1cc0e..de43fa020c74 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -87,16 +87,11 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                                               max_tokens,
                                               images=hf_images)
 
-    vllm_image_prompts = [
-        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
-        for p in HF_IMAGE_PROMPTS
-    ]
-
     with vllm_runner(model_id,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+        vllm_outputs = vllm_model.generate_greedy(HF_IMAGE_PROMPTS,
                                                   max_tokens,
                                                   images=vllm_images)
 
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index aa6ee268ae58..64265f1a3cd1 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -96,11 +96,6 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                                               max_tokens,
                                               images=hf_images)
 
-    vllm_image_prompts = [
-        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
-        for p in HF_IMAGE_PROMPTS
-    ]
-
     with vllm_runner(
             model_id,
             dtype=dtype,
@@ -109,7 +104,7 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
             enforce_eager=True,
             **vlm_config.as_cli_args_dict(),
     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+        vllm_outputs = vllm_model.generate_greedy(HF_IMAGE_PROMPTS,
                                                   max_tokens,
                                                   images=vllm_images)
 
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index f7713cfb7d43..148076180862 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -301,8 +301,6 @@ def _repeat_and_pad_image_tokens(
         pad_token_left: Optional[int] = None,
         pad_token_right: Optional[int] = None,
     ) -> Tuple[Optional[str], List[int]]:
-        # To avoid invoking the tokenizer, we assume that the
-        # image token is called "<image>"
         if prompt is None:
             new_prompt = None
         else:

From 3932b3f4289c0df89b2044edbb56df07ec20ae31 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 13 Jun 2024 13:33:24 +0000
Subject: [PATCH 030/181] Remove xfail

---
 tests/models/test_llava_next.py |  3 ---
 tests/multimodal/test_mapper.py | 26 +++++++++++---------------
 2 files changed, 11 insertions(+), 18 deletions(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 64265f1a3cd1..cd42eae4933c 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -72,9 +72,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     return hf_input_ids, hf_output_str
 
 
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 252f6088ebe6..0defe2b9a0f4 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -54,12 +54,9 @@ def test_clip_image_processor(hf_images, dtype):
             assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
 
 
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_llava_next_image_processor(hf_images, dtype):
-    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
+    MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"
     IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
@@ -73,14 +70,14 @@ def test_llava_next_image_processor(hf_images, dtype):
         seed=0,
         dtype=dtype,
         revision=None,
-    )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=64000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=2928,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
+        multimodal_config=VisionLanguageConfig(
+            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+            image_token_id=64000,
+            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+            image_feature_size=2928,
+            image_processor=MODEL_NAME,
+            image_processor_revision=None,
+        ),
     )
 
     for image in hf_images:
@@ -88,10 +85,9 @@ def test_llava_next_image_processor(hf_images, dtype):
             image,
             return_tensors="pt",
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
-        vllm_result = MULTIMODAL_REGISTRY.process_input(
+        vllm_result = MULTIMODAL_REGISTRY.map_input(
+            model_config,
             ImagePixelData(image),
-            model_config=model_config,
-            vlm_config=vlm_config,
         )
 
         assert hf_result.keys() == vllm_result.keys()

From 7fa877a08f1565f24ba3c871c9a03974c35e2889 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 13 Jun 2024 13:42:47 +0000
Subject: [PATCH 031/181] Fix missing image token in OpenAI API serving

---
 vllm/entrypoints/openai/serving_chat.py | 49 +++++++++++++++----------
 1 file changed, 30 insertions(+), 19 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index d6d690d7e61a..f49e69e02b3a 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1,6 +1,7 @@
 import codecs
 import time
 from dataclasses import dataclass, field
+from functools import cached_property
 from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable,
                     List, Optional)
 from typing import Sequence as GenericSequence
@@ -94,6 +95,22 @@ def _load_chat_template(self, chat_template: Optional[str]):
             logger.warning(
                 "No chat template provided. Chat API will not work.")
 
+    @cached_property
+    def image_token_str(self) -> str:
+        vlm_config = self.model_config.multimodal_config
+        if vlm_config is None:
+            raise ValueError(
+                "'image_url' input is not supported as the loaded "
+                "model is not multimodal.")
+
+        image_token_id = vlm_config.image_token_id
+        if vlm_config.image_token_id is None:
+            raise ValueError(
+                "'image_url' input is not supported as the loaded "
+                "model does not specify an image token.")
+
+        return self.tokenizer.decode(image_token_id)
+
     def _parse_chat_message_content_parts(
         self,
         role: str,
@@ -109,29 +126,23 @@ def _parse_chat_message_content_parts(
 
                 texts.append(text)
             elif part_type == "image_url":
-                if self.model_config.multimodal_config is None:
-                    raise ValueError(
-                        "'image_url' input is not supported as the loaded "
-                        "model is not multimodal.")
-
-                elif len(image_futures) == 0:
-                    assert self.tokenizer is not None
-                    image_url = cast(ChatCompletionContentPartImageParam,
-                                     part)["image_url"]
-
-                    if image_url.get("detail", "auto") != "auto":
-                        logger.warning(
-                            "'image_url.detail' is currently not supported and "
-                            "will be ignored.")
-
-                    image_future = async_get_and_parse_image(image_url["url"])
-                    image_futures.append(image_future)
-
-                else:
+                if len(image_futures) > 0:
                     raise NotImplementedError(
                         "Multiple 'image_url' input is currently not supported."
                     )
 
+                image_token = self.image_token_str
+                image_url = cast(ChatCompletionContentPartImageParam,
+                                 part)["image_url"]
+
+                if image_url.get("detail", "auto") != "auto":
+                    logger.warning(
+                        "'image_url.detail' is currently not supported and "
+                        "will be ignored.")
+
+                texts.append(image_token)
+                image_future = async_get_and_parse_image(image_url["url"])
+                image_futures.append(image_future)
             else:
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 

From 092e550832a9b06d28871cd01ee7d35008e6f5ec Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 14 Jun 2024 02:26:44 +0000
Subject: [PATCH 032/181] Fix LLaVA-NeXT test

---
 tests/models/test_llava_next.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index cd42eae4933c..9c665e51e803 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,3 +1,4 @@
+import re
 from typing import List, Tuple
 
 import pytest
@@ -66,8 +67,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
         input_id for idx, input_id in enumerate(input_ids)
         if input_id != image_token_id or input_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, " ")
+    hf_output_str = re.sub(fr"({image_token_str})+", " ", output_str)
 
     return hf_input_ids, hf_output_str
 

From 7a1986217db4550e89fe055b1ce49e42bcf25699 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 14 Jun 2024 03:26:12 +0000
Subject: [PATCH 033/181] Remove duplicate processing in async engine

---
 vllm/engine/async_llm_engine.py | 14 +-------------
 1 file changed, 1 insertion(+), 13 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index 34504253bc44..53123215e8f8 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -582,21 +582,9 @@ async def add_request(
         if arrival_time is None:
             arrival_time = time.time()
 
-        if self.engine_use_ray:
-            processed_inputs = await self.engine.process_model_inputs_async \
-                .remote(  # type: ignore
-                    request_id=request_id,
-                    inputs=inputs,
-                    lora_request=lora_request)
-        else:
-            processed_inputs = await self.engine.process_model_inputs_async(
-                request_id=request_id,
-                inputs=inputs,
-                lora_request=lora_request)
-
         stream = self._request_tracker.add_request(
             request_id,
-            inputs=processed_inputs,
+            inputs=inputs,
             params=params,
             arrival_time=arrival_time,
             lora_request=lora_request,

From 18cc7e02d8ec0a65534107ee74f357976621e1df Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 18 Jun 2024 02:49:10 +0000
Subject: [PATCH 034/181] Set up dummy data factory for phi3v

---
 vllm/inputs/__init__.py                  |  4 ++--
 vllm/model_executor/models/llava_next.py |  3 +--
 vllm/model_executor/models/phi3v.py      | 20 +++++++++++++++++---
 vllm/multimodal/base.py                  |  2 +-
 vllm/multimodal/image.py                 | 18 +++++++++---------
 5 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index d251d1ac72f8..637c22394c89 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -1,7 +1,7 @@
 from .data import (LLMInputs, ParsedText, ParsedTokens, PromptInputs,
                    PromptStrictInputs, TextPrompt, TextTokensPrompt,
                    TokensPrompt, parse_and_batch_prompt)
-from .registry import InputRegistry
+from .registry import InputContext, InputRegistry
 
 INPUT_REGISTRY = InputRegistry()
 """
@@ -12,5 +12,5 @@
 __all__ = [
     "ParsedText", "ParsedTokens", "parse_and_batch_prompt", "TextPrompt",
     "TokensPrompt", "TextTokensPrompt", "PromptStrictInputs", "PromptInputs",
-    "LLMInputs", "INPUT_REGISTRY", "InputRegistry"
+    "LLMInputs", "INPUT_REGISTRY", "InputContext", "InputRegistry"
 ]
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 5e765ac468d7..3e8f4e35aa57 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -12,8 +12,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY
-from vllm.inputs.registry import InputContext
+from vllm.inputs import INPUT_REGISTRY, InputContext
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index e8f190d3fc4f..7106a5aa0525 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -22,6 +22,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.inputs import INPUT_REGISTRY, InputContext
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
@@ -32,7 +33,7 @@
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import get_dummy_image_data
+from vllm.multimodal.image import DummyImageDataFactories
 from vllm.sequence import SamplerOutput
 
 logger = logging.get_logger(__name__)
@@ -269,8 +270,21 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input()
-@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
+def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
+    seq_data = DummyImageDataFactories.dummy_seq_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        seq_len,
+        image_token_id=32044,
+    )
+    multi_modal_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+    )
+
+    return seq_data, multi_modal_data
+
+
+@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
 class Phi3VForCausalLM(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 6b4684a54d33..fb0cfe265253 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -3,7 +3,7 @@
                     TypeVar)
 
 from vllm.config import ModelConfig
-from vllm.inputs.registry import InputContext
+from vllm.inputs import InputContext
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index c451d89ee952..561332b9318d 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -100,7 +100,7 @@ class DummyImageDataFactories:
     """
 
     @classmethod
-    def _dummy_seq_data_for_clip(
+    def dummy_seq_data_for_clip(
         cls,
         hf_config: CLIPVisionConfig,
         seq_len: int,
@@ -118,7 +118,7 @@ def _dummy_seq_data_for_clip(
         return SequenceData(token_ids)
 
     @classmethod
-    def _dummy_pixel_data_for_clip(
+    def dummy_pixel_data_for_clip(
         cls,
         hf_config: CLIPVisionConfig,
         *,
@@ -135,7 +135,7 @@ def _dummy_pixel_data_for_clip(
         return ImagePixelData(image)
 
     @classmethod
-    def _dummy_feature_data_for_clip(
+    def dummy_feature_data_for_clip(
         cls,
         hf_config: CLIPVisionConfig,
         *,
@@ -161,7 +161,7 @@ def _dummy_data_for_llava(
         vision_config = hf_config.vision_config
 
         if isinstance(vision_config, CLIPVisionConfig):
-            seq_data = cls._dummy_seq_data_for_clip(
+            seq_data = cls.dummy_seq_data_for_clip(
                 vision_config,
                 seq_len,
                 image_token_id=hf_config.image_token_index,
@@ -171,10 +171,10 @@ def _dummy_data_for_llava(
             ImageInputType = VisionLanguageConfig.ImageInputType
             multi_modal_data: MultiModalData
             if image_input_type == ImageInputType.PIXEL_VALUES:
-                multi_modal_data = cls._dummy_pixel_data_for_clip(
+                multi_modal_data = cls.dummy_pixel_data_for_clip(
                     vision_config)
             elif image_input_type == ImageInputType.IMAGE_FEATURES:
-                multi_modal_data = cls._dummy_feature_data_for_clip(
+                multi_modal_data = cls.dummy_feature_data_for_clip(
                     vision_config)
 
             return seq_data, multi_modal_data
@@ -198,7 +198,7 @@ def _dummy_data_for_llava_next(
             hf_config, input_height=dummy_height, input_width=dummy_width)
 
         if isinstance(vision_config, CLIPVisionConfig):
-            seq_data = cls._dummy_seq_data_for_clip(
+            seq_data = cls.dummy_seq_data_for_clip(
                 vision_config,
                 seq_len,
                 image_token_id=hf_config.image_token_index,
@@ -209,13 +209,13 @@ def _dummy_data_for_llava_next(
             ImageInputType = VisionLanguageConfig.ImageInputType
             multi_modal_data: MultiModalData
             if image_input_type == ImageInputType.PIXEL_VALUES:
-                multi_modal_data = cls._dummy_pixel_data_for_clip(
+                multi_modal_data = cls.dummy_pixel_data_for_clip(
                     vision_config,
                     image_width_override=dummy_width,
                     image_height_override=dummy_height,
                 )
             elif image_input_type == ImageInputType.IMAGE_FEATURES:
-                multi_modal_data = cls._dummy_feature_data_for_clip(
+                multi_modal_data = cls.dummy_feature_data_for_clip(
                     vision_config,
                     image_feature_size_override=image_feature_size,
                 )

From 2291617eb7906a5aed6f463ca9e1c0e837ed729c Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 18 Jun 2024 02:56:06 +0000
Subject: [PATCH 035/181] Move dummy data factories to model files

---
 vllm/model_executor/models/llava.py      |  37 ++++-
 vllm/model_executor/models/llava_next.py | 106 ++++++++++++-
 vllm/model_executor/models/phi3v.py      |   7 +-
 vllm/multimodal/image.py                 | 180 ++---------------------
 4 files changed, 145 insertions(+), 185 deletions(-)

diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 4a1d2dcec1ad..342106c3ad72 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -4,11 +4,11 @@
 import torch.nn as nn
 # TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on
 # transformers' impl.
-from transformers import CLIPVisionModel, LlavaConfig
+from transformers import CLIPVisionConfig, CLIPVisionModel, LlavaConfig
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY
+from vllm.inputs import INPUT_REGISTRY, InputContext
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -18,7 +18,7 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
 from vllm.multimodal.image import DummyImageDataFactories
 from vllm.sequence import SamplerOutput
 
@@ -85,10 +85,37 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
 
+def dummy_data_for_llava(ctx: InputContext, seq_len: int):
+    multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(LlavaConfig)
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        seq_data = DummyImageDataFactories.dummy_seq_data_for_clip(
+            vision_config,
+            seq_len,
+            image_token_id=hf_config.image_token_index,
+        )
+
+        image_input_type = multimodal_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+        mm_data: MultiModalData
+        if image_input_type == ImageInputType.PIXEL_VALUES:
+            mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
+                vision_config)
+        elif image_input_type == ImageInputType.IMAGE_FEATURES:
+            mm_data = DummyImageDataFactories.dummy_feature_data_for_clip(
+                vision_config)
+
+        return seq_data, mm_data
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
 @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
-@INPUT_REGISTRY.register_dummy_data(
-    DummyImageDataFactories.for_model(LlavaConfig))
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 3e8f4e35aa57..1644d7556957 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -5,7 +5,7 @@
 import torch.nn as nn
 # TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on
 # transformers' impl.
-from transformers import CLIPVisionModel, LlavaNextConfig
+from transformers import CLIPVisionConfig, CLIPVisionModel, LlavaNextConfig
 from transformers.models.llava_next.modeling_llava_next import (
     get_anyres_image_grid_shape, unpad_image)
 from typing_extensions import NotRequired
@@ -22,8 +22,9 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import DummyImageDataFactories, ImagePixelData
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
+from vllm.multimodal.image import (DummyImageDataFactories, ImagePixelData,
+                                   get_clip_num_patches)
 from vllm.sequence import SamplerOutput
 
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
@@ -59,6 +60,102 @@ class LlavaNextImageFeatureInputs(TypedDict):
                              LlavaNextImageFeatureInputs]
 
 
+def _get_llava_next_num_unpadded_features(
+    height: int,
+    width: int,
+    npatches: int,
+    num_patch_height: int,
+    num_patch_width: int,
+) -> Tuple[int, int]:
+    # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
+    current_height = npatches * num_patch_height
+    current_width = npatches * num_patch_width
+
+    aspect_ratio: float = width / height
+    current_aspect_ratio: float = current_width / current_height
+    if aspect_ratio > current_aspect_ratio:
+        new_height = (height * current_width) // width
+        current_height = new_height
+    else:
+        new_width = (width * current_height) // height
+        current_width = new_width
+
+    unpadded_features = current_height * current_width
+    newline_features = current_height
+    return (unpadded_features, newline_features)
+
+
+def _get_llava_next_image_feature_size(
+    hf_config: LlavaNextConfig,
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        num_patches = get_clip_num_patches(vision_config)
+        base_feature_size = num_patches * num_patches
+
+        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+            image_size=(input_height, input_width),
+            grid_pinpoints=hf_config.image_grid_pinpoints,
+            patch_size=vision_config.image_size,
+        )
+
+        (
+            unpadded_feature_size,
+            newline_feature_size,
+        ) = _get_llava_next_num_unpadded_features(input_height, input_width,
+                                                  num_patches,
+                                                  num_patch_height,
+                                                  num_patch_width)
+
+        return unpadded_feature_size + newline_feature_size + base_feature_size
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
+def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
+    multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(LlavaNextConfig)
+    vision_config = hf_config.vision_config
+
+    # Result in the max possible feature size
+    dummy_height = dummy_width = 448
+    image_feature_size = _get_llava_next_image_feature_size(
+        hf_config, input_height=dummy_height, input_width=dummy_width)
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        seq_data = DummyImageDataFactories.dummy_seq_data_for_clip(
+            vision_config,
+            seq_len,
+            image_token_id=hf_config.image_token_index,
+            image_feature_size_override=image_feature_size,
+        )
+
+        image_input_type = multimodal_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+        mm_data: MultiModalData
+        if image_input_type == ImageInputType.PIXEL_VALUES:
+            mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
+                vision_config,
+                image_width_override=dummy_width,
+                image_height_override=dummy_height,
+            )
+        elif image_input_type == ImageInputType.IMAGE_FEATURES:
+            mm_data = DummyImageDataFactories.dummy_feature_data_for_clip(
+                vision_config,
+                image_feature_size_override=image_feature_size,
+            )
+
+        return seq_data, mm_data
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
 def _pixel_mapper(ctx: InputContext,
                   data: ImagePixelData) -> Dict[str, torch.Tensor]:
     image = data.image
@@ -85,8 +182,7 @@ def _pixel_mapper(ctx: InputContext,
 
 @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_pixel_mapper)
-@INPUT_REGISTRY.register_dummy_data(
-    DummyImageDataFactories.for_model(LlavaNextConfig))
+@INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next)
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7106a5aa0525..ccd2adcf5a1c 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -276,11 +276,10 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
         seq_len,
         image_token_id=32044,
     )
-    multi_modal_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-    )
+    mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG, )
 
-    return seq_data, multi_modal_data
+    return seq_data, mm_data
 
 
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 561332b9318d..8cd62b3b1b6b 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,15 +1,12 @@
 from functools import lru_cache
-from typing import Dict, Optional, Tuple, Type, Union
+from typing import Dict, Optional, Type, Union
 
 import torch
 from PIL import Image
-from transformers import (CLIPVisionConfig, LlavaConfig, LlavaNextConfig,
-                          PretrainedConfig)
-from transformers.models.llava_next.modeling_llava_next import (
-    get_anyres_image_grid_shape)
+from transformers import CLIPVisionConfig
 
-from vllm.config import ModelConfig, VisionLanguageConfig
-from vllm.inputs.registry import DummyDataFactory, InputContext
+from vllm.config import ModelConfig
+from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
@@ -21,7 +18,7 @@
 _cached_get_image_processor = lru_cache(get_image_processor)
 
 
-def _get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:
+def get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:
     image_size = hf_config.image_size
     patch_size = hf_config.patch_size
 
@@ -29,68 +26,11 @@ def _get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:
     return image_size // patch_size
 
 
-def _get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
-    num_patches = _get_clip_num_patches(hf_config)
+def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
+    num_patches = get_clip_num_patches(hf_config)
     return num_patches * num_patches
 
 
-def _get_llava_next_num_unpadded_features(
-    height: int,
-    width: int,
-    npatches: int,
-    num_patch_height: int,
-    num_patch_width: int,
-) -> Tuple[int, int]:
-    # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
-    current_height = npatches * num_patch_height
-    current_width = npatches * num_patch_width
-
-    aspect_ratio: float = width / height
-    current_aspect_ratio: float = current_width / current_height
-    if aspect_ratio > current_aspect_ratio:
-        new_height = (height * current_width) // width
-        current_height = new_height
-    else:
-        new_width = (width * current_height) // height
-        current_width = new_width
-
-    unpadded_features = current_height * current_width
-    newline_features = current_height
-    return (unpadded_features, newline_features)
-
-
-def _get_llava_next_image_feature_size(
-    hf_config: LlavaNextConfig,
-    *,
-    input_height: int,
-    input_width: int,
-) -> int:
-    vision_config = hf_config.vision_config
-
-    if isinstance(vision_config, CLIPVisionConfig):
-        num_patches = _get_clip_num_patches(vision_config)
-        base_feature_size = num_patches * num_patches
-
-        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-            image_size=(input_height, input_width),
-            grid_pinpoints=hf_config.image_grid_pinpoints,
-            patch_size=vision_config.image_size,
-        )
-
-        (
-            unpadded_feature_size,
-            newline_feature_size,
-        ) = _get_llava_next_num_unpadded_features(input_height, input_width,
-                                                  num_patches,
-                                                  num_patch_height,
-                                                  num_patch_width)
-
-        return unpadded_feature_size + newline_feature_size + base_feature_size
-
-    msg = f"Unsupported vision config: {type(vision_config)}"
-    raise NotImplementedError(msg)
-
-
 class DummyImageDataFactories:
     """
     Contains factories for dummy image data factories.
@@ -109,7 +49,7 @@ def dummy_seq_data_for_clip(
         image_feature_size_override: Optional[int] = None,
     ):
         if image_feature_size_override is None:
-            image_feature_size = _get_clip_image_feature_size(hf_config)
+            image_feature_size = get_clip_image_feature_size(hf_config)
         else:
             image_feature_size = image_feature_size_override
 
@@ -142,7 +82,7 @@ def dummy_feature_data_for_clip(
         image_feature_size_override: Optional[int] = None,
     ):
         if image_feature_size_override is None:
-            image_feature_size = _get_clip_image_feature_size(hf_config)
+            image_feature_size = get_clip_image_feature_size(hf_config)
         else:
             image_feature_size = image_feature_size_override
 
@@ -150,108 +90,6 @@ def dummy_feature_data_for_clip(
                              dtype=torch.float16)
         return ImageFeatureData(values)
 
-    @classmethod
-    def _dummy_data_for_llava(
-        cls,
-        model_config: ModelConfig,
-        multimodal_config: VisionLanguageConfig,
-        hf_config: LlavaConfig,
-        seq_len: int,
-    ):
-        vision_config = hf_config.vision_config
-
-        if isinstance(vision_config, CLIPVisionConfig):
-            seq_data = cls.dummy_seq_data_for_clip(
-                vision_config,
-                seq_len,
-                image_token_id=hf_config.image_token_index,
-            )
-
-            image_input_type = multimodal_config.image_input_type
-            ImageInputType = VisionLanguageConfig.ImageInputType
-            multi_modal_data: MultiModalData
-            if image_input_type == ImageInputType.PIXEL_VALUES:
-                multi_modal_data = cls.dummy_pixel_data_for_clip(
-                    vision_config)
-            elif image_input_type == ImageInputType.IMAGE_FEATURES:
-                multi_modal_data = cls.dummy_feature_data_for_clip(
-                    vision_config)
-
-            return seq_data, multi_modal_data
-
-        msg = f"Unsupported vision config: {type(vision_config)}"
-        raise NotImplementedError(msg)
-
-    @classmethod
-    def _dummy_data_for_llava_next(
-        cls,
-        model_config: ModelConfig,
-        multimodal_config: VisionLanguageConfig,
-        hf_config: LlavaNextConfig,
-        seq_len: int,
-    ):
-        vision_config = hf_config.vision_config
-
-        # Result in the max possible feature size
-        dummy_height = dummy_width = 448
-        image_feature_size = _get_llava_next_image_feature_size(
-            hf_config, input_height=dummy_height, input_width=dummy_width)
-
-        if isinstance(vision_config, CLIPVisionConfig):
-            seq_data = cls.dummy_seq_data_for_clip(
-                vision_config,
-                seq_len,
-                image_token_id=hf_config.image_token_index,
-                image_feature_size_override=image_feature_size,
-            )
-
-            image_input_type = multimodal_config.image_input_type
-            ImageInputType = VisionLanguageConfig.ImageInputType
-            multi_modal_data: MultiModalData
-            if image_input_type == ImageInputType.PIXEL_VALUES:
-                multi_modal_data = cls.dummy_pixel_data_for_clip(
-                    vision_config,
-                    image_width_override=dummy_width,
-                    image_height_override=dummy_height,
-                )
-            elif image_input_type == ImageInputType.IMAGE_FEATURES:
-                multi_modal_data = cls.dummy_feature_data_for_clip(
-                    vision_config,
-                    image_feature_size_override=image_feature_size,
-                )
-
-            return seq_data, multi_modal_data
-
-        msg = f"Unsupported vision config: {type(vision_config)}"
-        raise NotImplementedError(msg)
-
-    @classmethod
-    def for_model(
-        cls,
-        hf_config_type: Type[PretrainedConfig],
-    ) -> DummyDataFactory:
-        """
-        Create an dummy image data factory for a model as identified
-        by the config type.
-        """
-        if hf_config_type == LlavaConfig:
-            return lambda ctx, seq_len: cls._dummy_data_for_llava(
-                ctx.model_config,
-                ctx.get_multimodal_config(),
-                ctx.get_hf_config(LlavaConfig),
-                seq_len=seq_len,
-            )
-        if hf_config_type == LlavaNextConfig:
-            return lambda ctx, seq_len: cls._dummy_data_for_llava_next(
-                ctx.model_config,
-                ctx.get_multimodal_config(),
-                ctx.get_hf_config(LlavaNextConfig),
-                seq_len=seq_len,
-            )
-
-        msg = f"Unsupported model config: {type(hf_config_type)}"
-        raise NotImplementedError(msg)
-
 
 class ImagePixelData(MultiModalData):
     """

From 9b0386dd08a05700d5669c4095dea5e178c39fdc Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 18 Jun 2024 04:45:04 +0000
Subject: [PATCH 036/181] Move input processors to model files

---
 vllm/model_executor/models/llava.py      |  32 ++++++-
 vllm/model_executor/models/llava_next.py |  49 +++++++++-
 vllm/multimodal/image.py                 | 115 ++---------------------
 3 files changed, 80 insertions(+), 116 deletions(-)

diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 437e357d9034..4bf58aec5c1f 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -8,7 +8,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY, InputContext
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -19,7 +19,8 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import DummyImageDataFactories, ImageInputProcessors
+from vllm.multimodal.image import (DummyImageDataFactories, ImageFeatureData,
+                                   ImageInputProcessors, ImagePixelData)
 from vllm.sequence import SamplerOutput
 
 from .vlm_base import VisionLanguageModelBase
@@ -113,11 +114,34 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int):
     raise NotImplementedError(msg)
 
 
+def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or not isinstance(
+            multi_modal_data, (ImagePixelData, ImageFeatureData)):
+        return llm_inputs
+
+    model_config = ctx.model_config
+    multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(LlavaConfig)
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return ImageInputProcessors.input_processor_for_clip(
+            model_config,
+            multimodal_config,
+            vision_config,
+            llm_inputs,
+            image_token_id=hf_config.image_token_index,
+        )
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
 @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava)
-@INPUT_REGISTRY.register_input_processor(
-    ImageInputProcessors.for_model(LlavaConfig))
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava)
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 1389ab232a24..5831b4749bd0 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -11,7 +11,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY, InputContext
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -22,8 +22,8 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import (DummyImageDataFactories,
-                                   ImageInputProcessors,
+from vllm.multimodal.image import (DummyImageDataFactories, ImageFeatureData,
+                                   ImageInputProcessors, ImagePixelData,
                                    get_clip_num_patches)
 from vllm.sequence import SamplerOutput
 
@@ -156,11 +156,50 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     raise NotImplementedError(msg)
 
 
+def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or not isinstance(
+            multi_modal_data, (ImagePixelData, ImageFeatureData)):
+        return llm_inputs
+
+    model_config = ctx.model_config
+    multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(LlavaNextConfig)
+    vision_config = hf_config.vision_config
+
+    if isinstance(multi_modal_data, ImagePixelData):
+        image = multi_modal_data.image
+        if isinstance(image, torch.Tensor):
+            _, _, _, height, width = image.shape
+        else:
+            width, height = image.size
+
+        image_feature_size = _get_llava_next_image_feature_size(
+            hf_config, input_height=height, input_width=width)
+    else:
+        image_features = multi_modal_data.image_features
+        image_feature_size = image_features.shape[-2]
+
+    vision_config = hf_config.vision_config
+
+    if isinstance(vision_config, CLIPVisionConfig):
+        return ImageInputProcessors.input_processor_for_clip(
+            model_config,
+            multimodal_config,
+            vision_config,
+            llm_inputs,
+            image_token_id=hf_config.image_token_index,
+            image_feature_size_override=image_feature_size,
+        )
+
+    msg = f"Unsupported vision config: {type(vision_config)}"
+    raise NotImplementedError(msg)
+
+
 @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_llava_next)
-@INPUT_REGISTRY.register_input_processor(
-    ImageInputProcessors.for_model(LlavaNextConfig))
+@INPUT_REGISTRY.register_input_processor(input_processor_for_llava_next)
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index cdccd5850f13..951615727f2d 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -4,11 +4,10 @@
 
 import torch
 from PIL import Image
-from transformers import (CLIPVisionConfig, LlavaConfig, LlavaNextConfig,
-                          PretrainedConfig, PreTrainedTokenizerBase)
+from transformers import CLIPVisionConfig, PreTrainedTokenizerBase
 
 from vllm.config import ModelConfig, VisionLanguageConfig
-from vllm.inputs.registry import InputContext, InputProcessor
+from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
@@ -112,7 +111,7 @@ class ImageInputProcessors:
     """
 
     @classmethod
-    def _repeat_and_pad_token(
+    def repeat_and_pad_token(
         cls,
         token: _T,
         *,
@@ -129,7 +128,7 @@ def _repeat_and_pad_token(
         return replacement
 
     @classmethod
-    def _repeat_and_pad_image_tokens(
+    def repeat_and_pad_image_tokens(
         cls,
         tokenizer: PreTrainedTokenizerBase,
         prompt: Optional[str],
@@ -149,7 +148,7 @@ def _repeat_and_pad_image_tokens(
             pad_token_str_right = (None if pad_token_right is None else
                                    tokenizer.decode(pad_token_right))
             replacement_str = "".join(
-                cls._repeat_and_pad_token(
+                cls.repeat_and_pad_token(
                     image_token_str,
                     repeat_count=repeat_count,
                     pad_token_left=pad_token_str_left,
@@ -162,7 +161,7 @@ def _repeat_and_pad_image_tokens(
         new_token_ids: List[int] = []
         for i, token in enumerate(prompt_token_ids):
             if token == image_token_id:
-                replacement_ids = cls._repeat_and_pad_token(
+                replacement_ids = cls.repeat_and_pad_token(
                     image_token_id,
                     repeat_count=repeat_count,
                     pad_token_left=pad_token_left,
@@ -179,7 +178,7 @@ def _repeat_and_pad_image_tokens(
         return new_prompt, new_token_ids
 
     @classmethod
-    def _input_processor_for_clip(
+    def input_processor_for_clip(
         cls,
         model_config: ModelConfig,
         multimodal_config: VisionLanguageConfig,
@@ -201,7 +200,7 @@ def _input_processor_for_clip(
         else:
             image_feature_size = image_feature_size_override
 
-        new_prompt, new_token_ids = cls._repeat_and_pad_image_tokens(
+        new_prompt, new_token_ids = cls.repeat_and_pad_image_tokens(
             tokenizer,
             llm_inputs.get("prompt"),
             llm_inputs["prompt_token_ids"],
@@ -214,104 +213,6 @@ def _input_processor_for_clip(
                          prompt=new_prompt,
                          multi_modal_data=multi_modal_data)
 
-    @classmethod
-    def _input_processor_for_llava(
-        cls,
-        model_config: ModelConfig,
-        multimodal_config: VisionLanguageConfig,
-        hf_config: LlavaConfig,
-        llm_inputs: LLMInputs,
-    ):
-        multi_modal_data = llm_inputs.get("multi_modal_data")
-        if multi_modal_data is None or not isinstance(
-                multi_modal_data, (ImagePixelData, ImageFeatureData)):
-            return llm_inputs
-
-        vision_config = hf_config.vision_config
-
-        if isinstance(vision_config, CLIPVisionConfig):
-            return cls._input_processor_for_clip(
-                model_config,
-                multimodal_config,
-                vision_config,
-                llm_inputs,
-                image_token_id=hf_config.image_token_index,
-            )
-
-        msg = f"Unsupported vision config: {type(vision_config)}"
-        raise NotImplementedError(msg)
-
-    @classmethod
-    def _input_processor_for_llava_next(
-        cls,
-        model_config: ModelConfig,
-        multimodal_config: VisionLanguageConfig,
-        hf_config: LlavaNextConfig,
-        llm_inputs: LLMInputs,
-    ):
-        multi_modal_data = llm_inputs.get("multi_modal_data")
-        if multi_modal_data is None or not isinstance(
-                multi_modal_data, (ImagePixelData, ImageFeatureData)):
-            return llm_inputs
-
-        if isinstance(multi_modal_data, ImagePixelData):
-            image = multi_modal_data.image
-            if isinstance(image, torch.Tensor):
-                _, _, _, height, width = image.shape
-            else:
-                width, height = image.size
-            
-            from vllm.model_executor.models.llava_next import (
-                _get_llava_next_image_feature_size)
-
-            image_feature_size = _get_llava_next_image_feature_size(
-                hf_config, input_height=height, input_width=width)
-        else:
-            image_features = multi_modal_data.image_features
-            image_feature_size = image_features.shape[-2]
-
-        vision_config = hf_config.vision_config
-
-        if isinstance(vision_config, CLIPVisionConfig):
-            return cls._input_processor_for_clip(
-                model_config,
-                multimodal_config,
-                vision_config,
-                llm_inputs,
-                image_token_id=hf_config.image_token_index,
-                image_feature_size_override=image_feature_size,
-            )
-
-        msg = f"Unsupported vision config: {type(vision_config)}"
-        raise NotImplementedError(msg)
-
-    @classmethod
-    def for_model(
-        cls,
-        hf_config_type: Type[PretrainedConfig],
-    ) -> InputProcessor:
-        """
-        Create an input processor for a model as identified
-        by the config type.
-        """
-        if hf_config_type == LlavaConfig:
-            return lambda ctx, llm_inputs: cls._input_processor_for_llava(
-                ctx.model_config,
-                ctx.get_multimodal_config(),
-                ctx.get_hf_config(LlavaConfig),
-                llm_inputs=llm_inputs,
-            )
-        if hf_config_type == LlavaNextConfig:
-            return lambda ctx, llm_inputs: cls._input_processor_for_llava_next(
-                ctx.model_config,
-                ctx.get_multimodal_config(),
-                ctx.get_hf_config(LlavaNextConfig),
-                llm_inputs=llm_inputs,
-            )
-
-        msg = f"Unsupported model config: {type(hf_config_type)}"
-        raise NotImplementedError(msg)
-
 
 class ImagePixelData(MultiModalData):
     """

From 4e656e7f7b6ed431f36e09e3ae9764532b598f27 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 18 Jun 2024 05:07:01 +0000
Subject: [PATCH 037/181] Set up input processor for phi3v

---
 tests/models/test_phi3v.py          | 10 ++--------
 vllm/model_executor/models/phi3v.py | 23 +++++++++++++++++++++--
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 607ad95e8c36..85524e01e26b 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -61,7 +61,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     ]
     hf_output_str = output_str \
         .replace(image_token_str * vlm_config.image_feature_size, "") \
-        .replace("<s>", " ").replace("<|user|>", "") \
+        .replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
 
     return hf_input_ids, hf_output_str
@@ -99,18 +99,12 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                                               max_tokens,
                                               images=hf_images)
 
-    vllm_image_prompts = [
-        p.replace("<|image_1|>",
-                  "<|image|>" * vlm_config.image_feature_size + "<s>")
-        for p in HF_IMAGE_PROMPTS
-    ]
-
     with vllm_runner(model_id,
                      max_model_len=2048,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+        vllm_outputs = vllm_model.generate_greedy(HF_IMAGE_PROMPTS,
                                                   max_tokens,
                                                   images=vllm_images)
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index ccd2adcf5a1c..b5a3b4506b7f 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -22,7 +22,7 @@
 
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, VisionLanguageConfig
-from vllm.inputs import INPUT_REGISTRY, InputContext
+from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
@@ -33,7 +33,8 @@
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import DummyImageDataFactories
+from vllm.multimodal.image import (DummyImageDataFactories, ImageFeatureData,
+                                   ImageInputProcessors, ImagePixelData)
 from vllm.sequence import SamplerOutput
 
 logger = logging.get_logger(__name__)
@@ -282,6 +283,24 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
     return seq_data, mm_data
 
 
+def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or not isinstance(
+            multi_modal_data, (ImagePixelData, ImageFeatureData)):
+        return llm_inputs
+
+    model_config = ctx.model_config
+    multimodal_config = ctx.get_multimodal_config()
+
+    return ImageInputProcessors.input_processor_for_clip(
+        model_config,
+        multimodal_config,
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        llm_inputs,
+        image_token_id=32044,
+    )
+
+
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
 class Phi3VForCausalLM(VisionLanguageModelBase):

From fecf1f0f50bbaa68f77d94d8044da08e1e2e87a4 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 18 Jun 2024 05:22:08 +0000
Subject: [PATCH 038/181] Fix wrong feature size

---
 vllm/model_executor/models/phi3v.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index ccd2adcf5a1c..3f3cedc2f594 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -275,9 +275,10 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         seq_len,
         image_token_id=32044,
+        image_feature_size_override=1921,
     )
     mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG, )
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG)
 
     return seq_data, mm_data
 

From 086e0fe81d270a01b37ff5ab6df304f525f0f06b Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 18 Jun 2024 07:17:19 +0000
Subject: [PATCH 039/181] Fix wrong feature size

---
 vllm/model_executor/models/phi3v.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 3f3cedc2f594..03e38d1fe26b 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -278,7 +278,10 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
         image_feature_size_override=1921,
     )
     mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG)
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        image_width_override=1344,
+        image_height_override=1008,
+    )
 
     return seq_data, mm_data
 

From 81522fee5b2d72c012f5b3c3cbae4edee574b6fc Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 19 Jun 2024 02:42:23 +0000
Subject: [PATCH 040/181] Fix wrong feature size

---
 vllm/model_executor/models/phi3v.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 1089761b5f4f..983c6b729231 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -296,12 +296,14 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
     model_config = ctx.model_config
     multimodal_config = ctx.get_multimodal_config()
 
+    # TODO: Dynamic feature size
     return ImageInputProcessors.input_processor_for_clip(
         model_config,
         multimodal_config,
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         llm_inputs,
         image_token_id=32044,
+        image_feature_size_override=1921,
     )
 
 

From b24e8d958300282e0b060e1e0a3a8edb24dd35c9 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 24 Jun 2024 14:04:52 +0000
Subject: [PATCH 041/181] Update validation

---
 vllm/model_executor/models/llava_next.py | 30 +++--------------------
 vllm/model_executor/models/phi3v.py      | 31 ++++++++++++++++++------
 2 files changed, 28 insertions(+), 33 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 514e8d52261a..e8007d9df57a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -46,19 +46,6 @@ class LlavaNextImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-class LlavaNextImageFeatureInputs(TypedDict):
-    type: Literal["image_features"]
-    data: torch.Tensor
-    """Shape: (batch_size, 1 + num_patches, image_feature_size, hidden_size)"""
-
-    image_sizes: NotRequired[torch.Tensor]
-    """Shape: (batch_size, 2)"""
-
-
-LlavaNextImageInputs = Union[LlavaNextImagePixelInputs,
-                             LlavaNextImageFeatureInputs]
-
-
 def _get_llava_next_num_unpadded_features(
     height: int,
     width: int,
@@ -208,7 +195,6 @@ def __init__(self,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__(vision_language_config)
 
-        # Update the type annotation from that of its superclass
         self.config = config
 
         if self.vision_language_config.image_input_type == (
@@ -265,18 +251,14 @@ def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
         return data
 
     def _parse_and_validate_image_input(
-            self, **kwargs: object) -> Optional[LlavaNextImageInputs]:
+            self, **kwargs: object) -> Optional[LlavaNextImagePixelInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
-        image_features = kwargs.pop("image_features", None)
 
         expected_input_type = self.vision_language_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
         if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if image_features is not None:
-                raise ValueError(
-                    "Expected pixel values but got image features")
             if pixel_values is None:
                 return None
 
@@ -403,12 +385,8 @@ def _process_image_pixels(
                                            *stacked_image_features.shape[-2:])
 
     def _process_image_input(
-            self, image_input: LlavaNextImageInputs) -> torch.Tensor:
-        if image_input["type"] == "pixel_values":
-            assert self.vision_tower is not None
-            image_features = self._process_image_pixels(image_input)
-        else:
-            image_features = image_input["data"]
+            self, image_input: LlavaNextImagePixelInputs) -> torch.Tensor:
+        image_features = self._process_image_pixels(image_input)
 
         patch_embeddings = self.multi_modal_projector(image_features)
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 487e6ad4a0ff..c4b5ad26b27c 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -353,10 +353,18 @@ def __init__(self,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
         super().__init__(vision_language_config)
+
         self.config = config
+
         self.model = LlamaModel(config, cache_config, quant_config)
-        self.vision_embed_tokens = Phi3HDImageEmbedding(
-            vision_language_config, config, self.model.embed_tokens)
+
+        if self.vision_language_config.image_input_type == (
+                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
+            self.vision_embed_tokens = Phi3HDImageEmbedding(
+                vision_language_config, config, self.model.embed_tokens)
+        else:
+            raise TypeError("Image features are not supported by LLaVA-NeXT")
+
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -369,16 +377,25 @@ def _parse_and_validate_image_input(
         expected_input_type = self.vision_language_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
-        if expected_input_type != ImageInputType.PIXEL_VALUES:
-            raise ValueError(
-                f"Unexpected image input type: {expected_input_type}."
-                "Phi3v only support pixel_values input currently.")
+        if expected_input_type == ImageInputType.PIXEL_VALUES:
+            if pixel_values is None:
+                return None
+
+            if not isinstance(pixel_values, torch.Tensor):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            if not isinstance(image_sizes, torch.Tensor):
+                raise ValueError("Incorrect type of image sizes. "
+                                 f"Got type: {type(image_sizes)}")
 
-        if pixel_values is not None and image_sizes is not None:
             return Phi3VImagePixelInputs(type="pixel_values",
                                          data=pixel_values,
                                          image_sizes=image_sizes)
 
+        assert expected_input_type != ImageInputType.IMAGE_FEATURES, (
+            "Failed to validate this at initialization time")
+
         return None
 
     def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,

From 8569d35da001d30b0099b6e27935d51f9b2aa8ae Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 24 Jun 2024 15:45:31 +0000
Subject: [PATCH 042/181] Fix image feature calculation for phi3v

---
 tests/models/test_phi3v.py          |  15 ++---
 vllm/model_executor/models/phi3v.py | 101 +++++++++++++++++++++-------
 2 files changed, 82 insertions(+), 34 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index f98472dc1d26..069430928512 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,3 +1,4 @@
+import re
 from typing import List, Tuple
 
 import pytest
@@ -54,16 +55,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id)
+    image_token_str = tokenizer.decode(image_token_id).replace("|", r"\|")
 
     hf_input_ids = [
         input_id if input_id != image_token_id else 0
         for idx, input_id in enumerate(input_ids)
+        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, "") \
-        .replace("<|user|>", "") \
+    hf_output_str = output_str.replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
+    hf_output_str = re.sub(fr"({image_token_str})+", " ", hf_output_str)
+    hf_output_str = re.sub(r"(<\|image_\d+\|>)+", " ", hf_output_str)
 
     return hf_input_ids, hf_output_str
 
@@ -74,11 +76,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
-# Since we use _attn_implementation="eager" for hf_runner, here is
-# numeric difference for longer context and test can't pass
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index c4b5ad26b27c..7046c603e24d 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -35,7 +35,8 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import (DummyImageDataFactories, ImageFeatureData,
-                                   ImageInputProcessors, ImagePixelData)
+                                   ImageInputProcessors, ImagePixelData,
+                                   _cached_get_tokenizer)
 from vllm.sequence import SamplerOutput
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -287,29 +288,8 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
     return seq_data, mm_data
 
 
-def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
-    multi_modal_data = llm_inputs.get("multi_modal_data")
-    if multi_modal_data is None or not isinstance(
-            multi_modal_data, (ImagePixelData, ImageFeatureData)):
-        return llm_inputs
-
-    model_config = ctx.model_config
-    multimodal_config = ctx.get_multimodal_config()
-
-    # TODO: Dynamic feature size
-    return ImageInputProcessors.input_processor_for_clip(
-        model_config,
-        multimodal_config,
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        llm_inputs,
-        image_token_id=32044,
-        image_feature_size_override=1921,
-    )
-
-
-# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported
 # copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
-def calc_padded_size(width, height, padding_unit=336):
+def _calc_padded_size(width, height, padding_unit=336):
     target_height = int(np.ceil(height / padding_unit) * padding_unit)
     top_padding = int((target_height - height) / 2)
     bottom_padding = target_height - height - top_padding
@@ -319,7 +299,7 @@ def calc_padded_size(width, height, padding_unit=336):
 
 
 # copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
-def calc_hd_transform_size(width, height, hd_num=16):
+def _calc_hd_transform_size(width, height, hd_num=16):
     transposed = False
     if width < height:
         width, height = height, width
@@ -334,7 +314,7 @@ def calc_hd_transform_size(width, height, hd_num=16):
     new_width = int(scale * 336)
     new_height = int(new_width / ratio)
 
-    padded_width, padded_height = calc_padded_size(new_width, new_height)
+    padded_width, padded_height = _calc_padded_size(new_width, new_height)
 
     if transposed:
         padded_width, padded_height = padded_height, padded_width
@@ -342,6 +322,77 @@ def calc_hd_transform_size(width, height, hd_num=16):
     return padded_width, padded_height
 
 
+def _get_phi3v_image_feature_size(
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    h, w = input_height, input_width
+
+    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L178
+    return (h // 336 * w // 336 + 1) * 144 + 1 + (h // 336 + 1) * 12
+
+
+def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or not isinstance(
+            multi_modal_data, (ImagePixelData, ImageFeatureData)):
+        return llm_inputs
+
+    model_config = ctx.model_config
+    multimodal_config = ctx.get_multimodal_config()
+
+    if isinstance(multi_modal_data, ImagePixelData):
+        image = multi_modal_data.image
+        if isinstance(image, torch.Tensor):
+            _, _, _, h, w = image.shape
+        else:
+            w, h = image.size
+        
+        w, h = _calc_hd_transform_size(w, h)
+
+        image_feature_size = _get_phi3v_image_feature_size(input_width=w,
+                                                           input_height=h)
+    else:
+        image_features = multi_modal_data.image_features
+        image_feature_size = image_features.shape[-2]
+    
+    prompt_token_ids = llm_inputs["prompt_token_ids"]
+    tokenizer = _cached_get_tokenizer(model_config.tokenizer)
+
+    # We need to get the token for "<", not "▁<"
+    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json
+    a_token_id, = tokenizer.encode("a", add_special_tokens=False)
+    a_token_id_, *image_1_token_ids = tokenizer.encode("a<|image_1|>",
+                                                       add_special_tokens=False)
+    assert a_token_id == a_token_id_
+
+    new_token_ids: List[int] = []
+    for i in range(len(prompt_token_ids) - len(image_1_token_ids) + 1):
+        if prompt_token_ids[i:i+len(image_1_token_ids)] == image_1_token_ids:
+            new_token_ids.append(multimodal_config.image_token_id)
+
+            # No need to further scan the list since we only replace once
+            new_token_ids.extend(prompt_token_ids[i + len(image_1_token_ids):])
+            break
+        else:
+            new_token_ids.append(prompt_token_ids[i])
+
+    # NOTE: Create a defensive copy of the original inputs
+    llm_inputs = LLMInputs(prompt_token_ids=new_token_ids,
+                           prompt=llm_inputs.get("prompt"),
+                           multi_modal_data=multi_modal_data)
+
+    return ImageInputProcessors.input_processor_for_clip(
+        model_config,
+        multimodal_config,
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        llm_inputs,
+        image_token_id=multimodal_config.image_token_id,
+        image_feature_size_override=image_feature_size,
+    )
+
+
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
 @INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v)

From bfa5aa9f737e75234c5a2369e2ac2ac15032f772 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 24 Jun 2024 15:49:50 +0000
Subject: [PATCH 043/181] Remove redundant code

---
 vllm/model_executor/models/clip.py | 10 +++++-----
 vllm/multimodal/image.py           | 13 +++----------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index aa4e87228a7e..a653163818b7 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -1,6 +1,6 @@
 """Minimal implementation of CLIPVisionModel intended to be only used 
 within a vision language model."""
-from typing import Optional, Tuple
+from typing import Optional
 
 import torch
 import torch.nn as nn
@@ -14,7 +14,7 @@
     QuantizationConfig)
 
 
-def get_clip_num_patches(image_size: int, patch_size: int) -> int:
+def get_clip_num_patches(*, image_size: int, patch_size: int) -> int:
     assert image_size % patch_size == 0
     return (image_size // patch_size)**2
 
@@ -39,8 +39,8 @@ def __init__(self, config: CLIPVisionConfig):
             bias=False,
         )
 
-        self.num_patches = get_clip_num_patches(self.image_size,
-                                                self.patch_size)
+        self.num_patches = get_clip_num_patches(image_size=self.image_size,
+                                                patch_size=self.patch_size)
         self.num_positions = self.num_patches + 1
         self.position_embedding = nn.Embedding(self.num_positions,
                                                self.embed_dim)
@@ -101,7 +101,7 @@ def __init__(self,
         self.layer_norm2 = nn.LayerNorm(config.hidden_size,
                                         eps=config.layer_norm_eps)
 
-    def forward(self, hidden_states: torch.Tensor) -> Tuple[torch.Tensor]:
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 
         residual = hidden_states
 
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 8cd62b3b1b6b..bffbf2a48896 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -7,6 +7,7 @@
 
 from vllm.config import ModelConfig
 from vllm.inputs.registry import InputContext
+from vllm.model_executor.models.clip import get_clip_num_patches
 from vllm.logger import init_logger
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
@@ -18,17 +19,9 @@
 _cached_get_image_processor = lru_cache(get_image_processor)
 
 
-def get_clip_num_patches(hf_config: CLIPVisionConfig) -> int:
-    image_size = hf_config.image_size
-    patch_size = hf_config.patch_size
-
-    assert image_size % patch_size == 0
-    return image_size // patch_size
-
-
 def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
-    num_patches = get_clip_num_patches(hf_config)
-    return num_patches * num_patches
+    return get_clip_num_patches(image_size=hf_config.image_size,
+                                patch_size=hf_config.patch_size)
 
 
 class DummyImageDataFactories:

From 07e695db7e3066b58c1bf009826d1a0cc4cc7232 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 24 Jun 2024 15:51:51 +0000
Subject: [PATCH 044/181] Apply isort

---
 vllm/multimodal/image.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index bffbf2a48896..723c09817c36 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -7,8 +7,8 @@
 
 from vllm.config import ModelConfig
 from vllm.inputs.registry import InputContext
-from vllm.model_executor.models.clip import get_clip_num_patches
 from vllm.logger import init_logger
+from vllm.model_executor.models.clip import get_clip_num_patches
 from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
 

From 825401d94d2d805586e56f2f4629aaf98f20692f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 24 Jun 2024 15:55:05 +0000
Subject: [PATCH 045/181] Apply yapf

---
 vllm/model_executor/models/phi3v.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 7046c603e24d..fafa0b79c4e3 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -348,7 +348,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
             _, _, _, h, w = image.shape
         else:
             w, h = image.size
-        
+
         w, h = _calc_hd_transform_size(w, h)
 
         image_feature_size = _get_phi3v_image_feature_size(input_width=w,
@@ -356,20 +356,20 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
     else:
         image_features = multi_modal_data.image_features
         image_feature_size = image_features.shape[-2]
-    
+
     prompt_token_ids = llm_inputs["prompt_token_ids"]
     tokenizer = _cached_get_tokenizer(model_config.tokenizer)
 
     # We need to get the token for "<", not "▁<"
     # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json
     a_token_id, = tokenizer.encode("a", add_special_tokens=False)
-    a_token_id_, *image_1_token_ids = tokenizer.encode("a<|image_1|>",
-                                                       add_special_tokens=False)
+    a_token_id_, *image_1_token_ids = tokenizer.encode(
+        "a<|image_1|>", add_special_tokens=False)
     assert a_token_id == a_token_id_
 
     new_token_ids: List[int] = []
     for i in range(len(prompt_token_ids) - len(image_1_token_ids) + 1):
-        if prompt_token_ids[i:i+len(image_1_token_ids)] == image_1_token_ids:
+        if prompt_token_ids[i:i + len(image_1_token_ids)] == image_1_token_ids:
             new_token_ids.append(multimodal_config.image_token_id)
 
             # No need to further scan the list since we only replace once

From 4a0d4d190821fa9f12bd29bf21bba87812e31841 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 02:55:00 +0000
Subject: [PATCH 046/181] Reduce `max_tokens` so that test still passes

---
 tests/models/test_phi3v.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 069430928512..6c36600bf7a1 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -75,10 +75,11 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     target_dtype = "bfloat16"
 
 
-# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
+# Since we use _attn_implementation="eager" for hf_runner, there is numeric
+# difference for longer context (max_tokens=128) and test can't pass
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("max_tokens", [8])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.

From 8d22fe0f2205233cf4cc67af58aa5b228a67c2d7 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 04:25:51 +0000
Subject: [PATCH 047/181] Fix vllm to hf output (+ rename)

---
 tests/models/test_llava.py      | 10 +++++-----
 tests/models/test_llava_next.py | 10 +++++-----
 tests/models/test_phi3v.py      | 25 +++++++++++++------------
 3 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 2ebe4367bfea..34f02799f68c 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -49,20 +49,20 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    input_ids, output_str = vllm_output
+    output_ids, output_str = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
 
-    hf_input_ids = [
-        input_id for idx, input_id in enumerate(input_ids)
-        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
     hf_output_str = output_str \
         .replace(image_token_str * vlm_config.image_feature_size, "")
 
-    return hf_input_ids, hf_output_str
+    return hf_output_ids, hf_output_str
 
 
 # TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index e4b2c3b2e8e4..b79a03c31bb1 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -57,19 +57,19 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    input_ids, output_str = vllm_output
+    output_ids, output_str = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
 
-    hf_input_ids = [
-        input_id for idx, input_id in enumerate(input_ids)
-        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    hf_output_ids = [
+        token_id for idx, token_id in enumerate(output_ids)
+        if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
     hf_output_str = re.sub(fr"({image_token_str})+", " ", output_str)
 
-    return hf_input_ids, hf_output_str
+    return hf_output_ids, hf_output_str
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 6c36600bf7a1..11d14067d580 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -2,7 +2,6 @@
 from typing import List, Tuple
 
 import pytest
-from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
 from vllm.utils import is_cpu
@@ -51,23 +50,25 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    input_ids, output_str = vllm_output
+    output_ids, output_str = vllm_output
     image_token_id = vlm_config.image_token_id
 
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(image_token_id).replace("|", r"\|")
+    hf_output_ids: List[int] = []
+    for idx, token_id in enumerate(output_ids):
+        if token_id != image_token_id:
+            hf_output_ids.append(token_id)
+        else:
+            hf_output_ids.append(0)
+
+            if output_ids[idx + 1] != image_token_id:
+                hf_output_ids.extend([1, 29871])
 
-    hf_input_ids = [
-        input_id if input_id != image_token_id else 0
-        for idx, input_id in enumerate(input_ids)
-        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
-    ]
     hf_output_str = output_str.replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
-    hf_output_str = re.sub(fr"({image_token_str})+", " ", hf_output_str)
+    hf_output_str = re.sub(r"(<\|image\|>)+", " ", hf_output_str)
     hf_output_str = re.sub(r"(<\|image_\d+\|>)+", " ", hf_output_str)
 
-    return hf_input_ids, hf_output_str
+    return hf_output_ids, hf_output_str
 
 
 target_dtype = "half"
@@ -79,7 +80,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 # difference for longer context (max_tokens=128) and test can't pass
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [4])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.

From 2e1ee2f08c0f1c17e7bcb6d1c1a6bce35768a2c7 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 07:14:28 +0000
Subject: [PATCH 048/181] Fix wrong arguments

---
 vllm/model_executor/models/llava_next.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index e8007d9df57a..dfaf6f122a2c 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -22,10 +22,10 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
 from vllm.multimodal.image import (DummyImageDataFactories, ImageFeatureData,
-                                   ImageInputProcessors, ImagePixelData,
-                                   get_clip_num_patches)
+                                   ImageInputProcessors, ImagePixelData)
 from vllm.sequence import SamplerOutput
 
+from .clip import get_clip_num_patches
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
 from .vlm_base import VisionLanguageModelBase
 
@@ -80,7 +80,8 @@ def _get_llava_next_image_feature_size(
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
-        num_patches = get_clip_num_patches(vision_config)
+        num_patches = get_clip_num_patches(image_size=vision_config.image_size,
+                                           patch_size=vision_config.patch_size)
         base_feature_size = num_patches * num_patches
 
         num_patch_height, num_patch_width = get_anyres_image_grid_shape(

From 7229b076acccfd9db13827711adb39720193dc48 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 07:26:08 +0000
Subject: [PATCH 049/181] Move `DummyImageDataFactories` into CLIP model file

---
 vllm/model_executor/models/clip.py       | 69 ++++++++++++++++++++--
 vllm/model_executor/models/llava.py      | 11 ++--
 vllm/model_executor/models/llava_next.py | 16 +++--
 vllm/model_executor/models/phi3v.py      |  8 ++-
 vllm/multimodal/image.py                 | 74 +-----------------------
 5 files changed, 88 insertions(+), 90 deletions(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index a653163818b7..77fbade056ee 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -4,19 +4,80 @@
 
 import torch
 import torch.nn as nn
+from PIL import Image
 from transformers import CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPAttention
 
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
-from vllm.model_executor.layers.quantization.base_config import (
-    QuantizationConfig)
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+from vllm.sequence import SequenceData
 
 
-def get_clip_num_patches(*, image_size: int, patch_size: int) -> int:
+def get_clip_patch_grid_length(*, image_size: int, patch_size: int) -> int:
     assert image_size % patch_size == 0
-    return (image_size // patch_size)**2
+    return image_size // patch_size
+
+
+def get_clip_num_patches(*, image_size: int, patch_size: int) -> int:
+    grid_length = get_clip_patch_grid_length(image_size=image_size,
+                                             patch_size=patch_size)
+    return grid_length * grid_length
+
+
+def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
+    return get_clip_num_patches(image_size=hf_config.image_size,
+                                patch_size=hf_config.patch_size)
+
+
+def dummy_seq_data_for_clip(
+    hf_config: CLIPVisionConfig,
+    seq_len: int,
+    *,
+    image_token_id: int,
+    image_feature_size_override: Optional[int] = None,
+):
+    if image_feature_size_override is None:
+        image_feature_size = get_clip_image_feature_size(hf_config)
+    else:
+        image_feature_size = image_feature_size_override
+
+    token_ids = [image_token_id] * image_feature_size
+    token_ids += [0] * (seq_len - image_feature_size)
+    return SequenceData(token_ids)
+
+
+def dummy_pixel_data_for_clip(
+    hf_config: CLIPVisionConfig,
+    *,
+    image_width_override: Optional[int] = None,
+    image_height_override: Optional[int] = None,
+):
+    width = height = hf_config.image_size
+    if image_width_override is not None:
+        width = image_width_override
+    if image_height_override is not None:
+        height = image_height_override
+
+    image = Image.new("RGB", (width, height), color=0)
+    return ImagePixelData(image)
+
+
+def dummy_feature_data_for_clip(
+    hf_config: CLIPVisionConfig,
+    *,
+    image_feature_size_override: Optional[int] = None,
+):
+    if image_feature_size_override is None:
+        image_feature_size = get_clip_image_feature_size(hf_config)
+    else:
+        image_feature_size = image_feature_size_override
+
+    values = torch.zeros((1, image_feature_size, hf_config.hidden_size),
+                         dtype=torch.float16)
+    return ImageFeatureData(values)
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index d547a748689b..90b2d76b41db 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -18,9 +18,10 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import DummyImageDataFactories
 from vllm.sequence import SamplerOutput
 
+from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
+                   dummy_seq_data_for_clip)
 from .vlm_base import VisionLanguageModelBase
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -90,7 +91,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int):
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
-        seq_data = DummyImageDataFactories.dummy_seq_data_for_clip(
+        seq_data = dummy_seq_data_for_clip(
             vision_config,
             seq_len,
             image_token_id=hf_config.image_token_index,
@@ -100,11 +101,9 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int):
         ImageInputType = VisionLanguageConfig.ImageInputType
         mm_data: MultiModalData
         if image_input_type == ImageInputType.PIXEL_VALUES:
-            mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
-                vision_config)
+            mm_data = dummy_pixel_data_for_clip(vision_config)
         elif image_input_type == ImageInputType.IMAGE_FEATURES:
-            mm_data = DummyImageDataFactories.dummy_feature_data_for_clip(
-                vision_config)
+            mm_data = dummy_feature_data_for_clip(vision_config)
 
         return seq_data, mm_data
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index bb7e0d2e821f..b3d2b42ec1d4 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -22,10 +22,11 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import (DummyImageDataFactories, ImagePixelData,
-                                   get_clip_num_patches)
+from vllm.multimodal.image import ImagePixelData
 from vllm.sequence import SamplerOutput
 
+from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
+                   dummy_seq_data_for_clip, get_clip_patch_grid_length)
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
 from .vlm_base import VisionLanguageModelBase
 
@@ -93,7 +94,10 @@ def _get_llava_next_image_feature_size(
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
-        num_patches = get_clip_num_patches(vision_config)
+        num_patches = get_clip_patch_grid_length(
+            image_size=vision_config.image_size,
+            patch_size=vision_config.patch_size,
+        )
         base_feature_size = num_patches * num_patches
 
         num_patch_height, num_patch_width = get_anyres_image_grid_shape(
@@ -127,7 +131,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
         hf_config, input_height=dummy_height, input_width=dummy_width)
 
     if isinstance(vision_config, CLIPVisionConfig):
-        seq_data = DummyImageDataFactories.dummy_seq_data_for_clip(
+        seq_data = dummy_seq_data_for_clip(
             vision_config,
             seq_len,
             image_token_id=hf_config.image_token_index,
@@ -138,13 +142,13 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
         ImageInputType = VisionLanguageConfig.ImageInputType
         mm_data: MultiModalData
         if image_input_type == ImageInputType.PIXEL_VALUES:
-            mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
+            mm_data = dummy_pixel_data_for_clip(
                 vision_config,
                 image_width_override=dummy_width,
                 image_height_override=dummy_height,
             )
         elif image_input_type == ImageInputType.IMAGE_FEATURES:
-            mm_data = DummyImageDataFactories.dummy_feature_data_for_clip(
+            mm_data = dummy_feature_data_for_clip(
                 vision_config,
                 image_feature_size_override=image_feature_size,
             )
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 763a6d46ac51..16b375613178 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -36,9 +36,11 @@
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import DummyImageDataFactories, ImagePixelData
+from vllm.multimodal.image import ImagePixelData
 from vllm.sequence import SamplerOutput
 
+from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
+
 logger = init_logger(__name__)
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -275,13 +277,13 @@ class Phi3VImagePixelInputs(TypedDict):
 
 
 def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
-    seq_data = DummyImageDataFactories.dummy_seq_data_for_clip(
+    seq_data = dummy_seq_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         seq_len,
         image_token_id=32044,
         image_feature_size_override=1921,
     )
-    mm_data = DummyImageDataFactories.dummy_pixel_data_for_clip(
+    mm_data = dummy_pixel_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         image_width_override=1344,
         image_height_override=1008,
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 723c09817c36..a9691575c2ea 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,87 +1,19 @@
 from functools import lru_cache
-from typing import Dict, Optional, Type, Union
+from typing import Dict, Type, Union
 
 import torch
 from PIL import Image
-from transformers import CLIPVisionConfig
 
 from vllm.config import ModelConfig
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
-from vllm.model_executor.models.clip import get_clip_num_patches
-from vllm.sequence import SequenceData
 from vllm.transformers_utils.image_processor import get_image_processor
 
 from .base import MultiModalData, MultiModalPlugin
 
 logger = init_logger(__name__)
 
-_cached_get_image_processor = lru_cache(get_image_processor)
-
-
-def get_clip_image_feature_size(hf_config: CLIPVisionConfig) -> int:
-    return get_clip_num_patches(image_size=hf_config.image_size,
-                                patch_size=hf_config.patch_size)
-
-
-class DummyImageDataFactories:
-    """
-    Contains factories for dummy image data factories.
-
-    See Also:
-        :data:`vllm.inputs.registry.DummyDataFactory`
-    """
-
-    @classmethod
-    def dummy_seq_data_for_clip(
-        cls,
-        hf_config: CLIPVisionConfig,
-        seq_len: int,
-        *,
-        image_token_id: int,
-        image_feature_size_override: Optional[int] = None,
-    ):
-        if image_feature_size_override is None:
-            image_feature_size = get_clip_image_feature_size(hf_config)
-        else:
-            image_feature_size = image_feature_size_override
-
-        token_ids = [image_token_id] * image_feature_size
-        token_ids += [0] * (seq_len - image_feature_size)
-        return SequenceData(token_ids)
-
-    @classmethod
-    def dummy_pixel_data_for_clip(
-        cls,
-        hf_config: CLIPVisionConfig,
-        *,
-        image_width_override: Optional[int] = None,
-        image_height_override: Optional[int] = None,
-    ):
-        width = height = hf_config.image_size
-        if image_width_override is not None:
-            width = image_width_override
-        if image_height_override is not None:
-            height = image_height_override
-
-        image = Image.new("RGB", (width, height), color=0)
-        return ImagePixelData(image)
-
-    @classmethod
-    def dummy_feature_data_for_clip(
-        cls,
-        hf_config: CLIPVisionConfig,
-        *,
-        image_feature_size_override: Optional[int] = None,
-    ):
-        if image_feature_size_override is None:
-            image_feature_size = get_clip_image_feature_size(hf_config)
-        else:
-            image_feature_size = image_feature_size_override
-
-        values = torch.zeros((1, image_feature_size, hf_config.hidden_size),
-                             dtype=torch.float16)
-        return ImageFeatureData(values)
+cached_get_image_processor = lru_cache(get_image_processor)
 
 
 class ImagePixelData(MultiModalData):
@@ -120,7 +52,7 @@ def _get_hf_image_processor(self, model_config: ModelConfig):
         if vlm_config is None or vlm_config.image_processor is None:
             return None
 
-        return _cached_get_image_processor(
+        return cached_get_image_processor(
             vlm_config.image_processor,
             trust_remote_code=model_config.trust_remote_code,
             revision=vlm_config.image_processor_revision,

From 50f994be1d50b238fe4fe826cb5bcd329790a90a Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 07:36:23 +0000
Subject: [PATCH 050/181] Move `input_processor_for_clip` into CLIP

---
 vllm/model_executor/models/clip.py       |  41 ++++-
 vllm/model_executor/models/llava.py      |   7 +-
 vllm/model_executor/models/llava_next.py |   8 +-
 vllm/model_executor/models/phi3v.py      |  11 +-
 vllm/multimodal/image.py                 | 185 ++++++++---------------
 5 files changed, 119 insertions(+), 133 deletions(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 77fbade056ee..6b4b91c671cc 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -8,11 +8,15 @@
 from transformers import CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPAttention
 
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.inputs import LLMInputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+from vllm.multimodal.image import (ImageFeatureData, ImagePixelData,
+                                   cached_get_tokenizer,
+                                   repeat_and_pad_image_tokens)
 from vllm.sequence import SequenceData
 
 
@@ -80,6 +84,41 @@ def dummy_feature_data_for_clip(
     return ImageFeatureData(values)
 
 
+def input_processor_for_clip(
+    model_config: ModelConfig,
+    multimodal_config: VisionLanguageConfig,
+    hf_config: CLIPVisionConfig,
+    llm_inputs: LLMInputs,
+    *,
+    image_token_id: int,
+    image_feature_size_override: Optional[int] = None,
+):
+    multi_modal_data = llm_inputs.get("multi_modal_data")
+    if multi_modal_data is None or not isinstance(
+            multi_modal_data, (ImagePixelData, ImageFeatureData)):
+        return llm_inputs
+
+    tokenizer = cached_get_tokenizer(model_config.tokenizer)
+
+    if image_feature_size_override is None:
+        image_feature_size = get_clip_image_feature_size(hf_config)
+    else:
+        image_feature_size = image_feature_size_override
+
+    new_prompt, new_token_ids = repeat_and_pad_image_tokens(
+        tokenizer,
+        llm_inputs.get("prompt"),
+        llm_inputs["prompt_token_ids"],
+        image_token_id=image_token_id,
+        repeat_count=image_feature_size,
+    )
+
+    # NOTE: Create a defensive copy of the original inputs
+    return LLMInputs(prompt_token_ids=new_token_ids,
+                     prompt=new_prompt,
+                     multi_modal_data=multi_modal_data)
+
+
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
 class CLIPVisionEmbeddings(nn.Module):
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 7536220a0daa..a924214ad6b6 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -18,12 +18,11 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import (ImageFeatureData, ImageInputProcessors,
-                                   ImagePixelData)
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SamplerOutput
 
 from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
-                   dummy_seq_data_for_clip)
+                   dummy_seq_data_for_clip, input_processor_for_clip)
 from .vlm_base import VisionLanguageModelBase
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -125,7 +124,7 @@ def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs):
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
-        return ImageInputProcessors.input_processor_for_clip(
+        return input_processor_for_clip(
             model_config,
             multimodal_config,
             vision_config,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 4bd133cbf9d3..859b465e152c 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -21,12 +21,12 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import (ImageFeatureData, ImageInputProcessors,
-                                   ImagePixelData)
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SamplerOutput
 
 from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
-                   dummy_seq_data_for_clip, get_clip_patch_grid_length)
+                   dummy_seq_data_for_clip, get_clip_patch_grid_length,
+                   input_processor_for_clip)
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
 from .vlm_base import VisionLanguageModelBase
 
@@ -173,7 +173,7 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
-        return ImageInputProcessors.input_processor_for_clip(
+        return input_processor_for_clip(
             model_config,
             multimodal_config,
             vision_config,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 6beedf6cefef..4e81ec75502c 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -34,11 +34,12 @@
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import (ImageFeatureData, ImageInputProcessors,
-                                   ImagePixelData, _cached_get_tokenizer)
+from vllm.multimodal.image import (ImageFeatureData, ImagePixelData,
+                                   cached_get_tokenizer)
 from vllm.sequence import SamplerOutput
 
-from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
+from .clip import (dummy_pixel_data_for_clip, dummy_seq_data_for_clip,
+                   input_processor_for_clip)
 
 _KEYS_TO_MODIFY_MAPPING = {
     "model.vision_embed_tokens": "vision_embed_tokens",
@@ -359,7 +360,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
         image_feature_size = image_features.shape[-2]
 
     prompt_token_ids = llm_inputs["prompt_token_ids"]
-    tokenizer = _cached_get_tokenizer(model_config.tokenizer)
+    tokenizer = cached_get_tokenizer(model_config.tokenizer)
 
     # We need to get the token for "<", not "▁<"
     # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json
@@ -384,7 +385,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
                            prompt=llm_inputs.get("prompt"),
                            multi_modal_data=multi_modal_data)
 
-    return ImageInputProcessors.input_processor_for_clip(
+    return input_processor_for_clip(
         model_config,
         multimodal_config,
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 023a7dff21d7..d9ec221563e3 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,143 +1,90 @@
 from functools import lru_cache
-from typing import (TYPE_CHECKING, Dict, List, Optional, Tuple, Type, TypeVar,
-                    Union)
+from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
 
 import torch
 from PIL import Image
-from transformers import CLIPVisionConfig, PreTrainedTokenizerBase
+from transformers import PreTrainedTokenizerBase
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.inputs.registry import InputContext
 from vllm.logger import init_logger
-from vllm.model_executor.models.clip import get_clip_image_feature_size
 from vllm.transformers_utils.image_processor import get_image_processor
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
 from .base import MultiModalData, MultiModalPlugin
 
-if TYPE_CHECKING:
-    from vllm.inputs import LLMInputs
-else:
-    LLMInputs = dict
-
 logger = init_logger(__name__)
 
 cached_get_image_processor = lru_cache(get_image_processor)
-_cached_get_tokenizer = lru_cache(get_tokenizer)
+cached_get_tokenizer = lru_cache(get_tokenizer)
 
+# Utilities for image input processors
 _T = TypeVar("_T", str, int)
 
 
-class ImageInputProcessors:
-    """
-    Contains factories for image input processors.
-
-    See Also:
-        :data:`vllm.inputs.registry.InputProcessor`
-    """
-
-    @classmethod
-    def repeat_and_pad_token(
-        cls,
-        token: _T,
-        *,
-        repeat_count: int = 1,
-        pad_token_left: Optional[_T] = None,
-        pad_token_right: Optional[_T] = None,
-    ) -> List[_T]:
-        replacement = [token] * repeat_count
-        if pad_token_left is not None:
-            replacement = [pad_token_left] + replacement
-        if pad_token_right is not None:
-            replacement = replacement + [pad_token_right]
-
-        return replacement
-
-    @classmethod
-    def repeat_and_pad_image_tokens(
-        cls,
-        tokenizer: PreTrainedTokenizerBase,
-        prompt: Optional[str],
-        prompt_token_ids: List[int],
-        *,
-        image_token_id: int,
-        repeat_count: int = 1,
-        pad_token_left: Optional[int] = None,
-        pad_token_right: Optional[int] = None,
-    ) -> Tuple[Optional[str], List[int]]:
-        if prompt is None:
-            new_prompt = None
-        else:
-            image_token_str = tokenizer.decode(image_token_id)
-            pad_token_str_left = (None if pad_token_left is None else
-                                  tokenizer.decode(pad_token_left))
-            pad_token_str_right = (None if pad_token_right is None else
-                                   tokenizer.decode(pad_token_right))
-            replacement_str = "".join(
-                cls.repeat_and_pad_token(
-                    image_token_str,
-                    repeat_count=repeat_count,
-                    pad_token_left=pad_token_str_left,
-                    pad_token_right=pad_token_str_right,
-                ))
-
-            # The image tokens are removed to be consistent with HuggingFace
-            new_prompt = prompt.replace(image_token_str, replacement_str, 1)
-
-        new_token_ids: List[int] = []
-        for i, token in enumerate(prompt_token_ids):
-            if token == image_token_id:
-                replacement_ids = cls.repeat_and_pad_token(
-                    image_token_id,
-                    repeat_count=repeat_count,
-                    pad_token_left=pad_token_left,
-                    pad_token_right=pad_token_right,
-                )
-                new_token_ids.extend(replacement_ids)
-
-                # No need to further scan the list since we only replace once
-                new_token_ids.extend(prompt_token_ids[i + 1:])
-                break
-            else:
-                new_token_ids.append(token)
-
-        return new_prompt, new_token_ids
-
-    @classmethod
-    def input_processor_for_clip(
-        cls,
-        model_config: ModelConfig,
-        multimodal_config: VisionLanguageConfig,
-        hf_config: CLIPVisionConfig,
-        llm_inputs: LLMInputs,
-        *,
-        image_token_id: int,
-        image_feature_size_override: Optional[int] = None,
-    ):
-        multi_modal_data = llm_inputs.get("multi_modal_data")
-        if multi_modal_data is None or not isinstance(
-                multi_modal_data, (ImagePixelData, ImageFeatureData)):
-            return llm_inputs
-
-        tokenizer = _cached_get_tokenizer(model_config.tokenizer)
-
-        if image_feature_size_override is None:
-            image_feature_size = get_clip_image_feature_size(hf_config)
+def repeat_and_pad_token(
+    token: _T,
+    *,
+    repeat_count: int = 1,
+    pad_token_left: Optional[_T] = None,
+    pad_token_right: Optional[_T] = None,
+) -> List[_T]:
+    replacement = [token] * repeat_count
+    if pad_token_left is not None:
+        replacement = [pad_token_left] + replacement
+    if pad_token_right is not None:
+        replacement = replacement + [pad_token_right]
+
+    return replacement
+
+
+def repeat_and_pad_image_tokens(
+    tokenizer: PreTrainedTokenizerBase,
+    prompt: Optional[str],
+    prompt_token_ids: List[int],
+    *,
+    image_token_id: int,
+    repeat_count: int = 1,
+    pad_token_left: Optional[int] = None,
+    pad_token_right: Optional[int] = None,
+) -> Tuple[Optional[str], List[int]]:
+    if prompt is None:
+        new_prompt = None
+    else:
+        image_token_str = tokenizer.decode(image_token_id)
+        pad_token_str_left = (None if pad_token_left is None else
+                              tokenizer.decode(pad_token_left))
+        pad_token_str_right = (None if pad_token_right is None else
+                               tokenizer.decode(pad_token_right))
+        replacement_str = "".join(
+            repeat_and_pad_token(
+                image_token_str,
+                repeat_count=repeat_count,
+                pad_token_left=pad_token_str_left,
+                pad_token_right=pad_token_str_right,
+            ))
+
+        # The image tokens are removed to be consistent with HuggingFace
+        new_prompt = prompt.replace(image_token_str, replacement_str, 1)
+
+    new_token_ids: List[int] = []
+    for i, token in enumerate(prompt_token_ids):
+        if token == image_token_id:
+            replacement_ids = repeat_and_pad_token(
+                image_token_id,
+                repeat_count=repeat_count,
+                pad_token_left=pad_token_left,
+                pad_token_right=pad_token_right,
+            )
+            new_token_ids.extend(replacement_ids)
+
+            # No need to further scan the list since we only replace once
+            new_token_ids.extend(prompt_token_ids[i + 1:])
+            break
         else:
-            image_feature_size = image_feature_size_override
-
-        new_prompt, new_token_ids = cls.repeat_and_pad_image_tokens(
-            tokenizer,
-            llm_inputs.get("prompt"),
-            llm_inputs["prompt_token_ids"],
-            image_token_id=image_token_id,
-            repeat_count=image_feature_size,
-        )
+            new_token_ids.append(token)
 
-        # NOTE: Create a defensive copy of the original inputs
-        return LLMInputs(prompt_token_ids=new_token_ids,
-                         prompt=new_prompt,
-                         multi_modal_data=multi_modal_data)
+    return new_prompt, new_token_ids
 
 
 class ImagePixelData(MultiModalData):

From 838aa9bb05045ee627fa1f81b553f3d71c20fcf7 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 07:39:20 +0000
Subject: [PATCH 051/181] Remove some magic numbers

---
 vllm/model_executor/models/phi3v.py | 35 +++++++++++++++++------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 4e81ec75502c..25e2a36dff7b 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -274,17 +274,35 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
+def _get_phi3v_image_feature_size(
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    h, w = input_height, input_width
+
+    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L178
+    return (h // 336 * w // 336 + 1) * 144 + 1 + (h // 336 + 1) * 12
+
+
 def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
+    # TODO: How to get the max possible feature size?
+    dummy_height, dummy_width = 1344, 1008
+    image_feature_size = _get_phi3v_image_feature_size(
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
+
     seq_data = dummy_seq_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         seq_len,
         image_token_id=32044,
-        image_feature_size_override=1921,
+        image_feature_size_override=image_feature_size,
     )
     mm_data = dummy_pixel_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        image_width_override=1344,
-        image_height_override=1008,
+        image_width_override=dummy_width,
+        image_height_override=dummy_height,
     )
 
     return seq_data, mm_data
@@ -324,17 +342,6 @@ def _calc_hd_transform_size(width, height, hd_num=16):
     return padded_width, padded_height
 
 
-def _get_phi3v_image_feature_size(
-    *,
-    input_height: int,
-    input_width: int,
-) -> int:
-    h, w = input_height, input_width
-
-    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L178
-    return (h // 336 * w // 336 + 1) * 144 + 1 + (h // 336 + 1) * 12
-
-
 def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
     multi_modal_data = llm_inputs.get("multi_modal_data")
     if multi_modal_data is None or not isinstance(

From e7a55644df9fd96d8aefbd78c0c8e8fd77667031 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 10:26:09 +0000
Subject: [PATCH 052/181] Test multiscale inputs for LLaVA-NeXT

---
 tests/models/test_llava_next.py | 56 ++++++++++++++++++++++-----------
 vllm/multimodal/utils.py        |  6 ++++
 vllm/worker/model_runner.py     |  4 ++-
 3 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index b79a03c31bb1..1295801b3eb9 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -5,6 +5,8 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.utils import rescale_image
 
 from ..conftest import IMAGE_FILES
 
@@ -25,11 +27,12 @@
 
 
 def iter_llava_next_configs(model_name: str):
+    # Need to use the max possible feature size for profile_run
     image_hw_to_feature_size = {
-        (336, 336): 1176,
+        (336, 336): 2928,
         (672, 672): 2928,
-        (1344, 336): 1944,
-        (336, 1344): 1890,
+        (1344, 336): 2928,
+        (336, 1344): 2928,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -74,9 +77,11 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
-                model_and_config, dtype: str, max_tokens: int) -> None:
+                model_and_config, dtype: str, max_tokens: int,
+                is_multiscale: bool) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -88,10 +93,21 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     """
     model_id, vlm_config = model_and_config
 
+    combinations = [
+        (rescale_image(hf_image, image_scale),
+         ImagePixelData(image=rescale_image(vllm_image.image, image_scale)),
+         prompt) for hf_image, vllm_image, prompt in zip(
+             hf_images, vllm_images, HF_IMAGE_PROMPTS)
+        for image_scale in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
+    ]
+    prompt_inputs = [prompt for _, _, prompt in combinations]
+    hf_image_inputs = [hf_image for hf_image, _, _ in combinations]
+    vllm_image_inputs = [vllm_image for _, vllm_image, _ in combinations]
+
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+        hf_outputs = hf_model.generate_greedy(prompt_inputs,
                                               max_tokens,
-                                              images=hf_images)
+                                              images=hf_image_inputs)
 
     with vllm_runner(
             model_id,
@@ -101,15 +117,19 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
             enforce_eager=True,
             **vlm_config.as_cli_args_dict(),
     ) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(HF_IMAGE_PROMPTS,
+        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
                                                   max_tokens,
-                                                  images=vllm_images)
-
-    for i in range(len(HF_IMAGE_PROMPTS)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-            vllm_outputs[i], vlm_config, model_id)
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+                                                  images=vllm_image_inputs)
+
+    for i in range(len(combinations)):
+        try:
+            hf_output_ids, hf_output_str = hf_outputs[i]
+            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                vllm_outputs[i], vlm_config, model_id)
+            assert hf_output_str == vllm_output_str, (
+                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+            assert hf_output_ids == vllm_output_ids, (
+                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+        except Exception as e:
+            msg = f"Wrong output for combination {combinations[i]}"
+            raise AssertionError(msg) from e
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 55157e4b377f..5e2f0727ebe9 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -67,3 +67,9 @@ def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
 def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     """Load image from base64 format."""
     return Image.open(BytesIO(base64.b64decode(image)))
+
+
+def rescale_image(image: Image.Image, factor: float) -> Image.Image:
+    new_width = int(image.width * factor)
+    new_height = int(image.height * factor)
+    return image.resize((new_width, new_height))
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index c7b35f8db19d..db8d4fef28c1 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -823,7 +823,9 @@ def profile_run(self) -> None:
 
             seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
                 .dummy_data_for_profiling(model_config, seq_len)
-            assert len(seq_data.prompt_token_ids) == seq_len
+            assert len(seq_data.prompt_token_ids) == seq_len, (
+                f"Wrong number of tokens generated. Expected: {seq_len} "
+                f"but got: {len(seq_data.prompt_token_ids)}")
 
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),

From 36e8001311b6ca79839c5b8afc8d7136288951ab Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 25 Jun 2024 12:31:43 +0000
Subject: [PATCH 053/181] Handle multiscale inputs (different number of patches
 per batch) in LLaVA-NeXT

---
 vllm/model_executor/models/llava.py      | 24 +------
 vllm/model_executor/models/llava_next.py | 90 ++++++++++++------------
 vllm/model_executor/models/phi3v.py      | 14 ++--
 vllm/model_executor/models/vlm_base.py   | 36 ++++++++++
 vllm/multimodal/__init__.py              |  7 +-
 vllm/multimodal/base.py                  | 78 +++++++++++++++++---
 vllm/multimodal/image.py                 | 20 +++---
 vllm/worker/cpu_model_runner.py          | 24 +++----
 vllm/worker/embedding_model_runner.py    |  5 +-
 vllm/worker/model_runner.py              | 22 +++---
 10 files changed, 196 insertions(+), 124 deletions(-)

diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index a924214ad6b6..9f3901acee67 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -53,34 +53,16 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-def merge_vision_embeddings(input_ids: torch.Tensor,
-                            inputs_embeds: torch.Tensor,
-                            vision_embeddings: torch.Tensor,
-                            image_token_id: int) -> torch.Tensor:
-    """In place merges in vision_embeddings with inputs_embeds."""
-    mask = (input_ids == image_token_id)
-
-    image_feature_size = vision_embeddings.shape[0] * vision_embeddings.shape[1]
-    if mask.sum() != image_feature_size:
-        raise ValueError(f"image_feature_size should be {image_feature_size}, "
-                         f"but found: {mask.sum()}")
-
-    inputs_embeds[mask] = vision_embeddings.view(image_feature_size,
-                                                 vision_embeddings.shape[-1])
-
-    return inputs_embeds
-
-
 class LlavaImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
     data: torch.Tensor
-    """Shape: (batch_size, num_channels, height, width)"""
+    """Shape: `(batch_size, num_channels, height, width)`"""
 
 
 class LlavaImageFeatureInputs(TypedDict):
     type: Literal["image_features"]
     data: torch.Tensor
-    """Shape: (batch_size, image_feature_size, hidden_size)"""
+    """Shape: `(batch_size, image_feature_size, hidden_size)`"""
 
 
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
@@ -325,7 +307,7 @@ def forward(
             vision_embeddings = self._process_image_input(image_input)
             inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
-            inputs_embeds = merge_vision_embeddings(
+            inputs_embeds = self.merge_vision_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
                 self.vision_language_config.image_token_id)
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 859b465e152c..08bf8485d1d2 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import Callable, Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -20,14 +20,14 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors, MultiModalData
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SamplerOutput
 
 from .clip import (dummy_feature_data_for_clip, dummy_pixel_data_for_clip,
                    dummy_seq_data_for_clip, get_clip_patch_grid_length,
                    input_processor_for_clip)
-from .llava import LlavaMultiModalProjector, merge_vision_embeddings
+from .llava import LlavaMultiModalProjector
 from .vlm_base import VisionLanguageModelBase
 
 logger = init_logger(__name__)
@@ -40,11 +40,15 @@
 
 class LlavaNextImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: (batch_size, 1 + num_patches, num_channels, height, width)"""
+    data: BatchedTensors
+    """
+    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+
+    Note that `num_patches` may be different for each batch.
+    """
 
     image_sizes: NotRequired[torch.Tensor]
-    """Shape: (batch_size, 2)"""
+    """Shape: `(batch_size, 2)`"""
 
 
 def _get_llava_next_num_unpadded_features(
@@ -228,24 +232,6 @@ def __init__(self,
         self.image_newline = nn.Parameter(
             torch.empty(config.text_config.hidden_size))
 
-    def _validate_image_pixels(self, data: torch.Tensor) -> torch.Tensor:
-        _, num_channels, _, _ = self.vision_language_config.image_input_shape
-
-        # Note that this is different from that of vLLM vision_language_config
-        # since the image is resized by the HuggingFace preprocessor
-        height = width = self.config.vision_config.image_size
-
-        if list(data.shape[2:]) != [num_channels, height, width]:
-            raise ValueError(
-                f"The expected image tensor shape is batch dimension plus "
-                f"num_patches plus {[num_channels, height, width]}. "
-                f"You supplied {data.shape}. "
-                f"If you are using vLLM's entrypoint, make sure your "
-                f"supplied image input is consistent with "
-                f"image_input_shape in engine args.")
-
-        return data
-
     def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
         if list(data.shape[1:]) != [2]:
             raise ValueError(
@@ -266,7 +252,7 @@ def _parse_and_validate_image_input(
             if pixel_values is None:
                 return None
 
-            if not isinstance(pixel_values, torch.Tensor):
+            if not isinstance(pixel_values, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
@@ -276,7 +262,7 @@ def _parse_and_validate_image_input(
 
             return LlavaNextImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_image_pixels(pixel_values),
+                data=pixel_values,
                 image_sizes=self._validate_image_sizes(image_sizes),
             )
 
@@ -330,20 +316,20 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
                 other_patch_embeds = patch_embeddings[1:]
 
                 # image_aspect_ratio == "anyres"
-                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
-                    (orig_width, orig_height),
+                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+                    (orig_height, orig_width),
                     self.config.image_grid_pinpoints,
                     self.config.vision_config.image_size,
                 )
                 other_patch_embeds = other_patch_embeds \
-                    .view(num_patch_width, num_patch_height, height, width, -1)
+                    .view(num_patch_height, num_patch_width, height, width, -1)
 
                 if "unpad" in strategy:
                     other_patch_embeds = other_patch_embeds \
                         .permute(4, 0, 2, 1, 3).contiguous() \
                         .flatten(1, 2).flatten(2, 3)
                     other_patch_embeds = unpad_image(other_patch_embeds,
-                                                     image_size)
+                                                     (orig_height, orig_width))
                     other_patch_embeds = torch.cat((
                         other_patch_embeds,
                         self.image_newline[:, None, None] \
@@ -374,43 +360,55 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
         raise ValueError(f"Unexpected patch merge strategy: {strategy}")
 
     def _process_image_pixels(
-            self, inputs: LlavaNextImagePixelInputs) -> torch.Tensor:
+        self,
+        inputs: LlavaNextImagePixelInputs,
+        proj: Callable[[torch.Tensor], torch.Tensor],
+    ) -> BatchedTensors:
         assert self.vision_tower is not None
 
         pixel_values = inputs["data"]
 
-        b, num_patches, c, h, w = pixel_values.shape
-        stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
+        if isinstance(pixel_values, torch.Tensor):
+            b, num_patches, c, h, w = pixel_values.shape
+            stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
+            stacked_image_features = self._image_pixels_to_features(
+                self.vision_tower, stacked_pixel_values)
+            stacked_patch_embeddings = proj(stacked_image_features)
+
+            return stacked_patch_embeddings.view(b, num_patches,
+                                                 *stacked_patch_embeddings.shape[1:])
 
+        num_patches_per_batch = [v.shape[0] for v in pixel_values]
+        stacked_pixel_values = torch.cat(pixel_values)
         stacked_image_features = self._image_pixels_to_features(
             self.vision_tower, stacked_pixel_values)
 
-        return stacked_image_features.view(b, num_patches,
-                                           *stacked_image_features.shape[-2:])
+        return [
+            proj(image_features)
+            for image_features in torch.split(stacked_image_features,
+                                              num_patches_per_batch)
+        ]
 
     def _process_image_input(
-            self, image_input: LlavaNextImagePixelInputs) -> torch.Tensor:
-        image_features = self._process_image_pixels(image_input)
-
-        patch_embeddings = self.multi_modal_projector(image_features)
+            self, image_input: LlavaNextImagePixelInputs) -> BatchedTensors:
+        patch_embeddings = self._process_image_pixels(image_input,
+                                                      proj=self.multi_modal_projector)
 
         image_sizes = image_input.get("image_sizes")
         if image_sizes is None:
-            batch_size = image_input["data"].shape[0]
+            batch_size = len(image_input["data"])
             vision_config = self.config.vision_config
             default_width = default_height = vision_config.image_size
             image_sizes = torch.as_tensor([[default_width, default_height]
                                            for _ in range(batch_size)])
 
-        merged_patch_embeddings = [
+        return [
             self._merge_image_patch_embeddings(image_sizes[i],
-                                               patch_features,
+                                               patch_features_batch,
                                                strategy="spatial_unpad")
-            for i, patch_features in enumerate(patch_embeddings)
+            for i, patch_features_batch in enumerate(patch_embeddings)
         ]
 
-        return torch.stack(merged_patch_embeddings, dim=0)
-
     def forward(
         self,
         input_ids: torch.Tensor,
@@ -459,7 +457,7 @@ def forward(
             vision_embeddings = self._process_image_input(image_input)
             inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
-            inputs_embeds = merge_vision_embeddings(
+            inputs_embeds = self.merge_vision_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
                 self.vision_language_config.image_token_id)
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 25e2a36dff7b..8d2dbbf7de8e 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -33,7 +33,7 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensors
 from vllm.multimodal.image import (ImageFeatureData, ImagePixelData,
                                    cached_get_tokenizer)
 from vllm.sequence import SamplerOutput
@@ -267,11 +267,15 @@ def forward(self,
 
 class Phi3VImagePixelInputs(TypedDict):
     type: Literal["pixel_values"]
-    data: torch.Tensor
-    """Shape: (batch_size, 1 + num_patches, num_channels, height, width)"""
+    data: BatchedTensors
+    """
+    Shape: `(batch_size, 1 + num_patches, num_channels, height, width)`
+
+    Note that `num_patches` may be different for each batch.
+    """
 
     image_sizes: torch.Tensor
-    """Shape: (batch_size, 2)"""
+    """Shape: `(batch_size, 2)`"""
 
 
 def _get_phi3v_image_feature_size(
@@ -441,7 +445,7 @@ def _parse_and_validate_image_input(
             if pixel_values is None:
                 return None
 
-            if not isinstance(pixel_values, torch.Tensor):
+            if not isinstance(pixel_values, (torch.Tensor, list)):
                 raise ValueError("Incorrect type of pixel values. "
                                  f"Got type: {type(pixel_values)}")
 
diff --git a/vllm/model_executor/models/vlm_base.py b/vllm/model_executor/models/vlm_base.py
index eb0aa96e50d5..b81a539f663d 100644
--- a/vllm/model_executor/models/vlm_base.py
+++ b/vllm/model_executor/models/vlm_base.py
@@ -1,6 +1,8 @@
+import torch
 from torch import nn
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal import BatchedTensors
 
 
 class VisionLanguageModelBase(nn.Module):
@@ -10,3 +12,37 @@ def __init__(self, vision_language_config: VisionLanguageConfig) -> None:
         super().__init__()
 
         self.vision_language_config = vision_language_config
+
+    @classmethod
+    def merge_vision_embeddings(cls,
+                                input_ids: torch.Tensor,
+                                inputs_embeds: torch.Tensor,
+                                vision_embeddings: BatchedTensors,
+                                image_token_id: int) -> torch.Tensor:
+        """In place merges in vision_embeddings with inputs_embeds."""
+        mask = (input_ids == image_token_id)
+        num_expected_tokens = mask.sum()
+
+        if isinstance(vision_embeddings, torch.Tensor):
+            batch_size, batch_tokens, *_, embed_dim = vision_embeddings.shape
+            total_tokens = batch_size * batch_tokens
+            if num_expected_tokens != total_tokens:
+                expr = f"{batch_size} x {batch_tokens}"
+                raise ValueError(
+                    f"Attempted to assign {expr} = {total_tokens} "
+                    f"image tokens to {num_expected_tokens} placeholders")
+
+            inputs_embeds[mask] = vision_embeddings.view(total_tokens,
+                                                         embed_dim)
+        else:
+            size_per_batch = [t.shape[0] for t in vision_embeddings]
+            total_tokens = sum(size_per_batch)
+            if num_expected_tokens != total_tokens:
+                expr = ' + '.join(map(str, size_per_batch))
+                raise ValueError(
+                    f"Attempted to assign {expr} = {total_tokens} "
+                    f"image tokens to {num_expected_tokens} placeholders")
+            
+            inputs_embeds[mask] = torch.cat(vision_embeddings)
+
+        return inputs_embeds
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index dfd47f476d37..96ae403c3d09 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,5 @@
-from .base import MultiModalData, MultiModalPlugin
+from .base import (BatchedTensors, MultiModalData, MultiModalInputs,
+                   MultiModalPlugin)
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -8,6 +9,6 @@
 """
 
 __all__ = [
-    "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
-    "MultiModalRegistry"
+    "BatchedTensors", "MultiModalData", "MultiModalInputs", "MultiModalPlugin",
+    "MULTIMODAL_REGISTRY", "MultiModalRegistry"
 ]
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index fb0cfe265253..efba2304abb7 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,15 +1,16 @@
 from abc import ABC, abstractmethod
-from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
-                    TypeVar)
+from collections import UserDict, defaultdict
+from typing import (Callable, Dict, Generic, List, Optional, Type, TypeVar,
+                    Union)
+
+import torch
+import torch.types
+from torch import nn
 
 from vllm.config import ModelConfig
 from vllm.inputs import InputContext
 from vllm.logger import init_logger
 
-if TYPE_CHECKING:
-    import torch
-    from torch import nn
-
 logger = init_logger(__name__)
 
 
@@ -30,10 +31,65 @@ class MultiModalData:
     pass
 
 
+BatchedTensors = Union[torch.Tensor, List[torch.Tensor]]
+"""
+If each input tensor in the batch has the same size, this is a single batched
+tensor; otherwise, this is a list of tensors with one element per batch.
+"""
+
+
+class MultiModalInputs(UserDict[str, torch.Tensor]):
+    """
+    A dictionary that represents the keyword arguments to
+    :meth:`~torch.nn.Module.forward`.
+    """
+
+    @staticmethod
+    def try_concat(
+        tensors: List[torch.Tensor],
+        *,
+        device: torch.types.Device,
+    ) -> BatchedTensors:
+        unbatched_shape = tensors[0].shape[1:]
+
+        for tensor in tensors:
+            if tensor.shape[1:] != unbatched_shape:
+                return [tensor.squeeze(0).to(device=device)
+                        for tensor in tensors]
+
+        return torch.cat(tensors, dim=0).to(device=device)
+
+    @staticmethod
+    def batch(
+        inputs_list: List["MultiModalInputs"],
+        device: torch.types.Device,
+    ) -> Dict[str, BatchedTensors]:
+        """Batch multiple inputs together into a dictionary."""
+        if len(inputs_list) == 0:
+            return {}
+
+        keys = inputs_list[0].keys()
+
+        item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list)
+
+        for inputs in inputs_list:
+            if inputs.keys() != keys:
+                msg = f"Inputs do not share the same keys ({keys})"
+                raise ValueError(msg)
+
+            for k, v in inputs.items():
+                item_lists[k].append(v)
+
+        return {
+            k: MultiModalInputs.try_concat(item_list, device=device)
+            for k, item_list in item_lists.items()
+        }
+
+
 D = TypeVar("D", bound=MultiModalData)
-N = TypeVar("N", bound=Type["nn.Module"])
+N = TypeVar("N", bound=Type[nn.Module])
 
-MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[InputContext, D], MultiModalInputs]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
@@ -51,7 +107,7 @@ class MultiModalPlugin(ABC, Generic[D]):
     """
 
     def __init__(self) -> None:
-        self._input_mappers: Dict[Type["nn.Module"],
+        self._input_mappers: Dict[Type[nn.Module],
                                   MultiModalInputMapper[D]] = {}
 
     @abstractmethod
@@ -64,7 +120,7 @@ def get_data_type(self) -> Type[D]:
 
     @abstractmethod
     def _default_input_mapper(self, ctx: InputContext,
-                              data: D) -> Dict[str, "torch.Tensor"]:
+                              data: D) -> MultiModalInputs:
         """Return a dictionary to be passed as keyword arguments to
         :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
@@ -99,7 +155,7 @@ def wrapper(model_cls: N) -> N:
         return wrapper
 
     def map_input(self, model_config: ModelConfig,
-                  data: D) -> Dict[str, "torch.Tensor"]:
+                  data: D) -> MultiModalInputs:
         """
         Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model, transforming the data into a dictionary of model inputs.
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index d9ec221563e3..e80536f44d47 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Dict, List, Optional, Tuple, Type, TypeVar, Union
+from typing import List, Optional, Tuple, Type, TypeVar, Union
 
 import torch
 from PIL import Image
@@ -11,7 +11,7 @@
 from vllm.transformers_utils.image_processor import get_image_processor
 from vllm.transformers_utils.tokenizer import get_tokenizer
 
-from .base import MultiModalData, MultiModalPlugin
+from .base import MultiModalData, MultiModalInputs, MultiModalPlugin
 
 logger = init_logger(__name__)
 
@@ -130,7 +130,7 @@ def _get_hf_image_processor(self, model_config: ModelConfig):
         )
 
     def _default_input_mapper(self, ctx: InputContext,
-                              data: ImagePixelData) -> Dict[str, torch.Tensor]:
+                              data: ImagePixelData) -> MultiModalInputs:
         model_config = ctx.model_config
         image = data.image
 
@@ -140,15 +140,18 @@ def _default_input_mapper(self, ctx: InputContext,
                 raise RuntimeError("No HuggingFace processor is available"
                                    "to process the image object")
             try:
-                return image_processor.preprocess(image, return_tensors="pt") \
+                batch_data = image_processor \
+                    .preprocess(image, return_tensors="pt") \
                     .to(model_config.dtype).data
             except Exception:
                 logger.error("Failed to process image (%s)", image)
                 raise
+
+            return MultiModalInputs(batch_data)
         elif isinstance(image, torch.Tensor):
             pixel_values = image.to(model_config.dtype)
 
-            return {"pixel_values": pixel_values}
+            return MultiModalInputs({"pixel_values": pixel_values})
 
         raise TypeError(f"Invalid image type: {type(image)}")
 
@@ -175,10 +178,9 @@ class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
     def get_data_type(self) -> Type[ImageFeatureData]:
         return ImageFeatureData
 
-    def _default_input_mapper(
-            self, ctx: InputContext,
-            data: ImageFeatureData) -> Dict[str, torch.Tensor]:
+    def _default_input_mapper(self, ctx: InputContext,
+                              data: ImageFeatureData) -> MultiModalInputs:
         model_config = ctx.model_config
         image_features = data.image_features.to(model_config.dtype)
 
-        return {"image_features": image_features}
+        return MultiModalInputs({"image_features": image_features})
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 119489757428..7c09eba44b4b 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,5 +1,4 @@
-from collections import defaultdict
-from typing import Dict, List, Optional, Tuple
+from typing import List, Mapping, Optional, Tuple
 
 import torch
 from torch import nn
@@ -12,7 +11,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
 
@@ -86,15 +86,14 @@ def load_model(self) -> None:
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], Dict[
-            str, torch.Tensor]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
-        multi_modal_kwargs_list: Dict[str,
-                                      List[torch.Tensor]] = defaultdict(list)
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -118,8 +117,7 @@ def _prepare_prompt(
             mm_data = seq_group_metadata.multi_modal_data
             if mm_data is not None:
                 mm_kwargs = self.multi_modal_input_mapper(mm_data)
-                for k, v in mm_kwargs.items():
-                    multi_modal_kwargs_list[k].append(v)
+                multi_modal_inputs_list.append(mm_kwargs)
 
             # Compute the slot mapping.
             block_table = seq_group_metadata.block_tables[seq_id]
@@ -143,10 +141,8 @@ def _prepare_prompt(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
-        multi_modal_kwargs = {
-            k: torch.cat(v, dim=0).to(self.device)
-            for k, v in multi_modal_kwargs_list.items()
-        }
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
 
         num_prompt_tokens = len(input_tokens)
 
@@ -262,7 +258,7 @@ def prepare_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Optional[Dict[str, torch.Tensor]]]:
+               Optional[Mapping[str, BatchedTensors]]]:
         multi_modal_kwargs = None
         if self.is_driver_worker:
             # NOTE: We assume that all sequences in the group are all prompts or
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index 465130d10e2f..cb9782232b72 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Optional, Set, Tuple
+from typing import Dict, List, Mapping, Optional, Set, Tuple
 
 import torch
 
@@ -11,6 +11,7 @@
 from vllm.lora.layers import LoRAMapping
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.pooling_metadata import PoolingMetadata
+from vllm.multimodal import BatchedTensors
 from vllm.pooling_params import PoolingParams
 from vllm.sequence import PoolerOutput, SequenceData, SequenceGroupMetadata
 from vllm.worker.model_runner import ModelRunner
@@ -90,7 +91,7 @@ def prepare_input_tensors(
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata,
-               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
+               Set[LoRARequest], LoRAMapping, Mapping[str, BatchedTensors]]:
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             # Prepare input tensors.
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index db8d4fef28c1..aa23742f7788 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,8 +1,7 @@
 import gc
 import time
 import warnings
-from collections import defaultdict
-from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union
+from typing import Dict, List, Mapping, NamedTuple, Optional, Set, Tuple, Union
 
 import numpy as np
 import torch
@@ -22,7 +21,8 @@
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.model_loader.tensorizer import TensorizerConfig
-from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
@@ -49,7 +49,7 @@ class ModelInput(NamedTuple):
     query_lens: List[int]
     lora_mapping: Optional[LoRAMapping]
     lora_requests: Set[LoRARequest]
-    multi_modal_kwargs: Dict[str, torch.Tensor]
+    multi_modal_kwargs: Mapping[str, BatchedTensors]
     slot_mapping: torch.Tensor
     num_prefill_tokens: int
     num_decode_tokens: int
@@ -265,8 +265,7 @@ def _prepare_model_input(
         context_lens: List[int] = []
         query_lens: List[int] = []
         block_tables: List[List[int]] = []
-        multi_modal_kwargs_list: Dict[str,
-                                      List[torch.Tensor]] = defaultdict(list)
+        multi_modal_inputs_list: List[MultiModalInputs] = []
         decode_only = True
         num_prefills = 0
         num_prefill_tokens = 0
@@ -445,8 +444,7 @@ def _prepare_model_input(
                 if mm_data is not None:
                     # Process multi-modal data
                     mm_kwargs = self.multi_modal_input_mapper(mm_data)
-                    for k, v in mm_kwargs.items():
-                        multi_modal_kwargs_list[k].append(v)
+                    multi_modal_inputs_list.append(mm_kwargs)
 
                 if _is_block_tables_empty(seq_group_metadata.block_tables):
                     # During memory profiling, the block tables are not
@@ -631,10 +629,8 @@ def _prepare_model_input(
         else:
             lora_mapping = None
 
-        multi_modal_kwargs = {
-            k: torch.cat(v, dim=0).to(self.device)
-            for k, v in multi_modal_kwargs_list.items()
-        }
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
 
         return ModelInput(
             input_tokens=input_tokens_tensor,
@@ -655,7 +651,7 @@ def prepare_input_tensors(
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
+               Set[LoRARequest], LoRAMapping, Mapping[str, BatchedTensors]]:
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             # Prepare input tensors.

From 39e6d42a5d4841c12f0739553f61353f2e24c21c Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 02:51:33 +0000
Subject: [PATCH 054/181] Fix wrong feature size

---
 vllm/model_executor/models/llava_next.py | 30 +++++++++++++++---------
 vllm/model_executor/models/phi3v.py      |  6 ++++-
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 08bf8485d1d2..751e82a9b699 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -48,9 +48,14 @@ class LlavaNextImagePixelInputs(TypedDict):
     """
 
     image_sizes: NotRequired[torch.Tensor]
-    """Shape: `(batch_size, 2)`"""
+    """
+    Shape: `(batch_size, 2)`
+
+    This should be in `(height, width)` format.
+    """
 
 
+# Taken from: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L91
 def _get_llava_next_num_unpadded_features(
     height: int,
     width: int,
@@ -58,7 +63,6 @@ def _get_llava_next_num_unpadded_features(
     num_patch_height: int,
     num_patch_width: int,
 ) -> Tuple[int, int]:
-    # Taken from: https://github.com/huggingface/text-generation-inference/blob/799a193b109662743bed1b18a09af1fdcd508c8b/server/text_generation_server/models/vlm_causal_lm.py#L111
     current_height = npatches * num_patch_height
     current_width = npatches * num_patch_width
 
@@ -76,6 +80,7 @@ def _get_llava_next_num_unpadded_features(
     return (unpadded_features, newline_features)
 
 
+# Based on: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L111
 def _get_llava_next_image_feature_size(
     hf_config: LlavaNextConfig,
     *,
@@ -91,7 +96,9 @@ def _get_llava_next_image_feature_size(
         )
         base_feature_size = num_patches * num_patches
 
-        num_patch_height, num_patch_width = get_anyres_image_grid_shape(
+        # Note: We follow the "wrong" width/height order
+        # [ref: PR huggingface/transformers#31588]
+        num_patch_width, num_patch_height = get_anyres_image_grid_shape(
             image_size=(input_height, input_width),
             grid_pinpoints=hf_config.image_grid_pinpoints,
             patch_size=vision_config.image_size,
@@ -302,7 +309,6 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
             return patch_embeddings.flatten(0, 1)
 
         if strategy.startswith("spatial"):
-            orig_width, orig_height = image_size
             height = width = self.config.vision_config.image_size \
                 // self.config.vision_config.patch_size
 
@@ -316,8 +322,10 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
                 other_patch_embeds = patch_embeddings[1:]
 
                 # image_aspect_ratio == "anyres"
-                num_patch_height, num_patch_width = get_anyres_image_grid_shape(
-                    (orig_height, orig_width),
+                # Note: We follow the "wrong" width/height order
+                # [ref: PR huggingface/transformers#31588]
+                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                    image_size,
                     self.config.image_grid_pinpoints,
                     self.config.vision_config.image_size,
                 )
@@ -329,7 +337,7 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
                         .permute(4, 0, 2, 1, 3).contiguous() \
                         .flatten(1, 2).flatten(2, 3)
                     other_patch_embeds = unpad_image(other_patch_embeds,
-                                                     (orig_height, orig_width))
+                                                     image_size)
                     other_patch_embeds = torch.cat((
                         other_patch_embeds,
                         self.image_newline[:, None, None] \
@@ -398,8 +406,8 @@ def _process_image_input(
         if image_sizes is None:
             batch_size = len(image_input["data"])
             vision_config = self.config.vision_config
-            default_width = default_height = vision_config.image_size
-            image_sizes = torch.as_tensor([[default_width, default_height]
+            default_height = default_width = vision_config.image_size
+            image_sizes = torch.as_tensor([[default_height, default_width]
                                            for _ in range(batch_size)])
 
         return [
@@ -441,8 +449,8 @@ def forward(
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each grid patch for each input image.
-                Expects a batch with shape `[1, num_patches, 3, 336, 336]`.
-            image_sizes: The original `(width, height)` for each input image.
+                Expects a batch with shape `[1, num_patches, 3, h, w]`.
+            image_sizes: The original `(height, width)` for each input image.
                 Expects a batch with shape `[1, 2]`.
 
         See also:
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 8d2dbbf7de8e..52a959adb350 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -275,7 +275,11 @@ class Phi3VImagePixelInputs(TypedDict):
     """
 
     image_sizes: torch.Tensor
-    """Shape: `(batch_size, 2)`"""
+    """
+    Shape: `(batch_size, 2)`
+
+    This should be in `(height, width)` format.
+    """
 
 
 def _get_phi3v_image_feature_size(

From 0d7f18fe0c6c560754c339ba18d9277c78140183 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 02:54:25 +0000
Subject: [PATCH 055/181] Apply formatter

---
 vllm/model_executor/models/llava_next.py | 16 ++++++++--------
 vllm/model_executor/models/vlm_base.py   |  9 ++++-----
 vllm/multimodal/base.py                  |  5 +++--
 3 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 751e82a9b699..6c6d94372a15 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,4 +1,5 @@
-from typing import Callable, Iterable, List, Literal, Optional, Tuple, TypedDict
+from typing import (Callable, Iterable, List, Literal, Optional, Tuple,
+                    TypedDict)
 
 import torch
 import torch.nn as nn
@@ -383,8 +384,8 @@ def _process_image_pixels(
                 self.vision_tower, stacked_pixel_values)
             stacked_patch_embeddings = proj(stacked_image_features)
 
-            return stacked_patch_embeddings.view(b, num_patches,
-                                                 *stacked_patch_embeddings.shape[1:])
+            return stacked_patch_embeddings.view(
+                b, num_patches, *stacked_patch_embeddings.shape[1:])
 
         num_patches_per_batch = [v.shape[0] for v in pixel_values]
         stacked_pixel_values = torch.cat(pixel_values)
@@ -392,15 +393,14 @@ def _process_image_pixels(
             self.vision_tower, stacked_pixel_values)
 
         return [
-            proj(image_features)
-            for image_features in torch.split(stacked_image_features,
-                                              num_patches_per_batch)
+            proj(image_features) for image_features in torch.split(
+                stacked_image_features, num_patches_per_batch)
         ]
 
     def _process_image_input(
             self, image_input: LlavaNextImagePixelInputs) -> BatchedTensors:
-        patch_embeddings = self._process_image_pixels(image_input,
-                                                      proj=self.multi_modal_projector)
+        patch_embeddings = self._process_image_pixels(
+            image_input, proj=self.multi_modal_projector)
 
         image_sizes = image_input.get("image_sizes")
         if image_sizes is None:
diff --git a/vllm/model_executor/models/vlm_base.py b/vllm/model_executor/models/vlm_base.py
index b81a539f663d..7ef0bbe7444a 100644
--- a/vllm/model_executor/models/vlm_base.py
+++ b/vllm/model_executor/models/vlm_base.py
@@ -14,8 +14,7 @@ def __init__(self, vision_language_config: VisionLanguageConfig) -> None:
         self.vision_language_config = vision_language_config
 
     @classmethod
-    def merge_vision_embeddings(cls,
-                                input_ids: torch.Tensor,
+    def merge_vision_embeddings(cls, input_ids: torch.Tensor,
                                 inputs_embeds: torch.Tensor,
                                 vision_embeddings: BatchedTensors,
                                 image_token_id: int) -> torch.Tensor:
@@ -32,8 +31,8 @@ def merge_vision_embeddings(cls,
                     f"Attempted to assign {expr} = {total_tokens} "
                     f"image tokens to {num_expected_tokens} placeholders")
 
-            inputs_embeds[mask] = vision_embeddings.view(total_tokens,
-                                                         embed_dim)
+            inputs_embeds[mask] = vision_embeddings.view(
+                total_tokens, embed_dim)
         else:
             size_per_batch = [t.shape[0] for t in vision_embeddings]
             total_tokens = sum(size_per_batch)
@@ -42,7 +41,7 @@ def merge_vision_embeddings(cls,
                 raise ValueError(
                     f"Attempted to assign {expr} = {total_tokens} "
                     f"image tokens to {num_expected_tokens} placeholders")
-            
+
             inputs_embeds[mask] = torch.cat(vision_embeddings)
 
         return inputs_embeds
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index efba2304abb7..7b221c00242f 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -54,8 +54,9 @@ def try_concat(
 
         for tensor in tensors:
             if tensor.shape[1:] != unbatched_shape:
-                return [tensor.squeeze(0).to(device=device)
-                        for tensor in tensors]
+                return [
+                    tensor.squeeze(0).to(device=device) for tensor in tensors
+                ]
 
         return torch.cat(tensors, dim=0).to(device=device)
 

From 6d02491aa9aaeac66b4df21e1c5f282ce6e15dd2 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 03:53:18 +0000
Subject: [PATCH 056/181] Revert max_tokens

---
 tests/models/test_llava_next.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 1295801b3eb9..7fc5061cece4 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -77,7 +77,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [64])
+@pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                 model_and_config, dtype: str, max_tokens: int,

From 76ddea411731b173858f4b29deb925f5a0ab1e75 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 04:45:21 +0000
Subject: [PATCH 057/181] Add more tests for input mapper

---
 tests/models/test_llava_next.py | 20 ++++++++++----------
 tests/multimodal/test_mapper.py | 11 +++++++++--
 vllm/multimodal/utils.py        |  7 ++++---
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 7fc5061cece4..48383c9e2528 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -6,7 +6,7 @@
 
 from vllm.config import VisionLanguageConfig
 from vllm.multimodal.image import ImagePixelData
-from vllm.multimodal.utils import rescale_image
+from vllm.multimodal.utils import rescale_image_size
 
 from ..conftest import IMAGE_FILES
 
@@ -93,16 +93,16 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     """
     model_id, vlm_config = model_and_config
 
-    combinations = [
-        (rescale_image(hf_image, image_scale),
-         ImagePixelData(image=rescale_image(vllm_image.image, image_scale)),
+    image_inputs = [
+        (rescale_image_size(hf_image, factor),
+         ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
          prompt) for hf_image, vllm_image, prompt in zip(
              hf_images, vllm_images, HF_IMAGE_PROMPTS)
-        for image_scale in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
+        for factor in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
     ]
-    prompt_inputs = [prompt for _, _, prompt in combinations]
-    hf_image_inputs = [hf_image for hf_image, _, _ in combinations]
-    vllm_image_inputs = [vllm_image for _, vllm_image, _ in combinations]
+    prompt_inputs = [prompt for _, _, prompt in image_inputs]
+    hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
+    vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs = hf_model.generate_greedy(prompt_inputs,
@@ -121,7 +121,7 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
                                                   max_tokens,
                                                   images=vllm_image_inputs)
 
-    for i in range(len(combinations)):
+    for i in range(len(image_inputs)):
         try:
             hf_output_ids, hf_output_str = hf_outputs[i]
             vllm_output_ids, vllm_output_str = vllm_to_hf_output(
@@ -131,5 +131,5 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
         except Exception as e:
-            msg = f"Wrong output for combination {combinations[i]}"
+            msg = f"Wrong output for combination {image_inputs[i]}"
             raise AssertionError(msg) from e
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 0defe2b9a0f4..5b07ba18b409 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -5,12 +5,14 @@
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.utils import rescale_image_size
 
 from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 
 
 @pytest.mark.parametrize("dtype", ["half", "float"])
-def test_clip_image_processor(hf_images, dtype):
+@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
+def test_clip_image_processor(hf_images, dtype, size_factor):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
     IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
@@ -36,6 +38,8 @@ def test_clip_image_processor(hf_images, dtype):
     )
 
     for image in hf_images:
+        image = rescale_image_size(image, size_factor)
+
         hf_result = hf_processor.preprocess(
             image,
             return_tensors="pt",
@@ -55,7 +59,8 @@ def test_clip_image_processor(hf_images, dtype):
 
 
 @pytest.mark.parametrize("dtype", ["half", "float"])
-def test_llava_next_image_processor(hf_images, dtype):
+@pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
+def test_llava_next_image_processor(hf_images, dtype, size_factor):
     MODEL_NAME = "llava-hf/llava-v1.6-vicuna-7b-hf"
     IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
@@ -81,6 +86,8 @@ def test_llava_next_image_processor(hf_images, dtype):
     )
 
     for image in hf_images:
+        image = rescale_image_size(image, size_factor)
+
         hf_result = hf_processor.preprocess(
             image,
             return_tensors="pt",
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 421a3aa9f278..e0c7cae00279 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -75,7 +75,8 @@ def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     return Image.open(BytesIO(base64.b64decode(image)))
 
 
-def rescale_image(image: Image.Image, factor: float) -> Image.Image:
-    new_width = int(image.width * factor)
-    new_height = int(image.height * factor)
+def rescale_image_size(image: Image.Image, size_factor: float) -> Image.Image:
+    """Rescale the dimensions of an image by a constant factor."""
+    new_width = int(image.width * size_factor)
+    new_height = int(image.height * size_factor)
     return image.resize((new_width, new_height))

From 4b20e66dae9727b113b99a13bbc9c4b118e80c40 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 04:48:44 +0000
Subject: [PATCH 058/181] Sanity check: Also test multiscale inputs for
 LLaVA-1.5 - Remove test for feature size as it will be unsupported after
 #5852 anyway

---
 tests/models/test_llava.py | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 34f02799f68c..a91b9c278a6e 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -4,6 +4,8 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.utils import rescale_image_size
 
 from ..conftest import IMAGE_FILES
 
@@ -26,7 +28,6 @@ def iter_llava_configs(model_name: str):
     for (h, w), f in image_hw_to_feature_size.items():
         for input_type, input_shape in [
             (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
         ]:
             yield (model_name,
                    VisionLanguageConfig(image_input_type=input_type,
@@ -69,8 +70,10 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
-                model_and_config, dtype: str, max_tokens: int) -> None:
+                model_and_config, dtype: str, max_tokens: int,
+                is_multiscale: bool) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -82,18 +85,29 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     """
     model_id, vlm_config = model_and_config
 
+    image_inputs = [
+        (rescale_image_size(hf_image, factor),
+         ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
+         prompt) for hf_image, vllm_image, prompt in zip(
+             hf_images, vllm_images, HF_IMAGE_PROMPTS)
+        for factor in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
+    ]
+    prompt_inputs = [prompt for _, _, prompt in image_inputs]
+    hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
+    vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
+
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+        hf_outputs = hf_model.generate_greedy(prompt_inputs,
                                               max_tokens,
-                                              images=hf_images)
+                                              images=hf_image_inputs)
 
     with vllm_runner(model_id,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(HF_IMAGE_PROMPTS,
+        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
                                                   max_tokens,
-                                                  images=vllm_images)
+                                                  images=vllm_image_inputs)
 
     for i in range(len(HF_IMAGE_PROMPTS)):
         hf_output_ids, hf_output_str = hf_outputs[i]

From 784af1a7f3d72ec498373af94e9db5a9a611210d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 05:10:32 +0000
Subject: [PATCH 059/181] Do not auto-convert image dtype to model's dtype -
 This is not done in HF, so we follow them such that consistency tests can
 pass

---
 vllm/multimodal/image.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index e80536f44d47..33b07f33714d 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -142,14 +142,14 @@ def _default_input_mapper(self, ctx: InputContext,
             try:
                 batch_data = image_processor \
                     .preprocess(image, return_tensors="pt") \
-                    .to(model_config.dtype).data
+                    .data
             except Exception:
                 logger.error("Failed to process image (%s)", image)
                 raise
 
             return MultiModalInputs(batch_data)
         elif isinstance(image, torch.Tensor):
-            pixel_values = image.to(model_config.dtype)
+            pixel_values = image
 
             return MultiModalInputs({"pixel_values": pixel_values})
 
@@ -180,7 +180,6 @@ def get_data_type(self) -> Type[ImageFeatureData]:
 
     def _default_input_mapper(self, ctx: InputContext,
                               data: ImageFeatureData) -> MultiModalInputs:
-        model_config = ctx.model_config
-        image_features = data.image_features.to(model_config.dtype)
+        image_features = data.image_features
 
         return MultiModalInputs({"image_features": image_features})

From 8e5fb128d4b641470787719ed3eaa812c04464c3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 05:38:27 +0000
Subject: [PATCH 060/181] Update prompts

---
 tests/models/test_llava.py      | 6 +++---
 tests/models/test_llava_next.py | 7 +++----
 tests/models/test_phi3v.py      | 1 -
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index a91b9c278a6e..c978ed9d8377 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -13,8 +13,8 @@
 
 # The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = [
-    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
-    "<image>\nUSER: What is the season?\nASSISTANT:",
+    "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
+    "USER: <image>\nWhat is the season?\nASSISTANT:",
 ]
 
 assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
@@ -61,7 +61,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
     hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, "")
+        .replace(image_token_str * vlm_config.image_feature_size, " ")
 
     return hf_output_ids, hf_output_str
 
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 48383c9e2528..ac5bf8c76fa8 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -17,10 +17,9 @@
     "The assistant gives helpful, detailed, and polite answers to the human's "
     "questions.")
 
-# The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = [
-    f"{_PREFACE} <image>\nUSER: What's the content of the image? ASSISTANT:",
-    f"{_PREFACE} <image>\nUSER: What is the season? ASSISTANT:",
+    f"{_PREFACE} USER: <image>\nWhat's the content of the image? ASSISTANT:",
+    f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
 ]
 
 assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
@@ -70,7 +69,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
         token_id for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = re.sub(fr"({image_token_str})+", " ", output_str)
+    hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
 
     return hf_output_ids, hf_output_str
 
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 11d14067d580..8a4b4f827424 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -10,7 +10,6 @@
 
 pytestmark = pytest.mark.vlm
 
-# The image token is placed before "user" on purpose so that the test can pass
 HF_IMAGE_PROMPTS = [
     "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
     "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",

From 865be7a0aeab34f48edb4d474c3642004379153c Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 08:47:08 +0000
Subject: [PATCH 061/181] Fix mapper tests w.r.t. dtype change

---
 tests/multimodal/test_mapper.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 3864736554cc..22d454e64779 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -7,8 +7,6 @@
 from vllm.multimodal.image import ImagePixelData
 from vllm.multimodal.utils import rescale_image_size
 
-from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
-
 
 @pytest.mark.parametrize("dtype", ["half", "float"])
 @pytest.mark.parametrize("size_factor", [0.25, 0.5, 1.0])
@@ -43,7 +41,7 @@ def test_clip_image_processor(image_assets, dtype, size_factor):
         hf_result = hf_processor.preprocess(
             image,
             return_tensors="pt",
-        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
+        )
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
             ImagePixelData(image),
@@ -91,7 +89,7 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
         hf_result = hf_processor.preprocess(
             image,
             return_tensors="pt",
-        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
+        )
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
             ImagePixelData(image),

From 9e82a26d4516ee61602713e189791cc94fb2b1cd Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 08:51:42 +0000
Subject: [PATCH 062/181] Clarify docs and add todo

---
 vllm/inputs/registry.py | 9 +++++++--
 vllm/multimodal/base.py | 2 ++
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 1abc51d1e308..5585df76d083 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -96,7 +96,8 @@ def _default_dummy_data_factory(
         seq_len: int,
     ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
         """
-        Create dummy data to be inputted into the model.
+        The default dummy data factory represents the longest possible text
+        that can be inputted to the model.
 
         Note:
             :data:`InputProcessor` is not applied to the dummy data.
@@ -145,7 +146,7 @@ def dummy_data_for_profiling(self, model_config: "ModelConfig",
 
     def _default_input_processor(self, ctx: InputContext,
                                  inputs: LLMInputs) -> LLMInputs:
-        """Preprocess the inputs to the model."""
+        """The default input processor is a no-op."""
         return inputs
 
     def register_input_processor(self, processor: InputProcessor):
@@ -154,6 +155,8 @@ def register_input_processor(self, processor: InputProcessor):
 
         The provided function is invoked on each input to the model. This
         happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
+
+        TODO: Add guide [ref: PR #5276]
         """
 
         def wrapper(model_cls: N) -> N:
@@ -177,6 +180,8 @@ def process_input(self, model_config: "ModelConfig",
         The model is identified by ``model_config``. ``vlm_config`` is
         for compatibility purposes and may be merged into ``model_config``
         in the near future.
+
+        TODO: Add guide [ref: PR #5276]
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index fb0cfe265253..c08f4602675c 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -103,6 +103,8 @@ def map_input(self, model_config: ModelConfig,
         """
         Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model, transforming the data into a dictionary of model inputs.
+
+        TODO: Add guide [ref: PR #5276]
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture

From a4733f99262e186c9ab631cd293a8b3b79c097ce Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 08:58:51 +0000
Subject: [PATCH 063/181] Remove TODO since vision config will be removed soon

---
 tests/models/test_llava.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index a4539af4a198..f9f366065b90 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -65,7 +65,6 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     return hf_output_ids, hf_output_str
 
 
-# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])

From 6b19e6c1cd1dc7bf8566df0ccdf9ce3fcdbbf615 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 09:05:04 +0000
Subject: [PATCH 064/181] Expand docs

---
 vllm/inputs/registry.py | 12 ++++++++----
 vllm/multimodal/base.py |  2 ++
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 5585df76d083..b8d73b002a41 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -134,7 +134,13 @@ def wrapper(model_cls: N) -> N:
 
     def dummy_data_for_profiling(self, model_config: "ModelConfig",
                                  seq_len: int):
-        """Create dummy data for memory profiling."""
+        """
+        Create dummy data for profiling the memory usage of a model.
+
+        The model is identified by ``model_config``.
+
+        TODO: Add guide [ref: PR #5276]
+        """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
 
@@ -177,9 +183,7 @@ def process_input(self, model_config: "ModelConfig",
         """
         Apply an input processor to an instance of model inputs.
 
-        The model is identified by ``model_config``. ``vlm_config`` is
-        for compatibility purposes and may be merged into ``model_config``
-        in the near future.
+        The model is identified by ``model_config``.
 
         TODO: Add guide [ref: PR #5276]
         """
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index c08f4602675c..2ed4706e982f 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -104,6 +104,8 @@ def map_input(self, model_config: ModelConfig,
         Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model, transforming the data into a dictionary of model inputs.
 
+        The model is identified by ``model_config``.
+
         TODO: Add guide [ref: PR #5276]
         """
         # Avoid circular import

From f4516682c12ba74dafc6062d5105dbd643a1329d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 09:06:23 +0000
Subject: [PATCH 065/181] Add ref

---
 docs/source/dev/input_processing/model_inputs_index.rst | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
index f1f929c92332..4dec6492cc34 100644
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -1,3 +1,5 @@
+.. _input_processing:
+
 Input Processing
 ================
 

From 3d7b7955b74e26af4642cc60d16e780c336d0511 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 09:30:39 +0000
Subject: [PATCH 066/181] Update docs

---
 .../source/dev/multimodal/adding_multimodal_model.rst | 11 ++++++++---
 vllm/multimodal/base.py                               |  6 +++++-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index 4a0010d47ba3..c09452fee587 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -70,7 +70,10 @@ In such cases, you can define your own dummy data by registering a factory metho
     + @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
     class YourModelForImage2Seq(nn.Module):
 
-Refer to :class:`vllm.multimodal.image.DummyImageDataFactories` for some examples of dummy data factories.
+Here are some examples:
+
+- Image inputs (static feature size): `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
+- Image inputs (dynamic feature size): `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
 
 
 3. (Optional) Register input processor
@@ -89,6 +92,8 @@ You can register input processors via :meth:`vllm.inputs.INPUT_REGISTRY.register
     + @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
     class YourModelForImage2Seq(nn.Module):
 
-A common use case of input processors is inserting extra image tokens to leverage the vLLM framework for attention mask generation.
-More details can be found in :class:`vllm.multimodal.image.ImageInputProcessors`.
+A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
+Here are some examples:
 
+- Insert static number of image tokens: `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
+- Insert dynamic number of image tokens: `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index aa7563a3a89d..4632c30dfb56 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -139,6 +139,9 @@ def register_input_mapper(
         this plugin (see :meth:`get_data_type`), the provided function is
         invoked to transform the data into a dictionary of model inputs.
         If `None` is provided, then the default input mapper is used instead.
+
+        See also:
+            :ref:`adding_a_new_multimodal_model`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -163,7 +166,8 @@ def map_input(self, model_config: ModelConfig,
 
         The model is identified by ``model_config``.
 
-        TODO: Add guide [ref: PR #5276]
+        See also:
+            :ref:`adding_a_new_multimodal_model`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture

From 1abb8a7480d5ee0f490e7d52fd6c6e1c7a07880b Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 09:52:43 +0000
Subject: [PATCH 067/181] Add docs

---
 .../input_processing/input_processing_pipeline.rst    | 11 +++++++++++
 .../dev/input_processing/model_inputs_index.rst       | 10 +++++++---
 docs/source/dev/multimodal/multimodal_index.rst       |  4 ----
 vllm/inputs/registry.py                               |  6 ++++--
 vllm/multimodal/base.py                               |  3 +++
 5 files changed, 25 insertions(+), 9 deletions(-)
 create mode 100644 docs/source/dev/input_processing/input_processing_pipeline.rst

diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/dev/input_processing/input_processing_pipeline.rst
new file mode 100644
index 000000000000..d7f8ffeda71a
--- /dev/null
+++ b/docs/source/dev/input_processing/input_processing_pipeline.rst
@@ -0,0 +1,11 @@
+.. _input_processing_pipeline:
+
+Input Processing Pipeline
+=========================
+
+1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
+2. Tokenize the data if necessary.
+3. Process the inputs using :meth:`~vllm.inputs.registry.InputRegistry.process_input`.
+4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
+5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunner`.
+6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
index 4dec6492cc34..e9d5bf176a3c 100644
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -8,9 +8,13 @@ Input Processing
 vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
 in :class:`~vllm.LLMEngine` before they are passed to model executors.
 
-.. contents::
-   :local:
-   :backlinks: none
+Guides
+++++++
+
+.. toctree::
+   :maxdepth: 1
+
+   input_processing_pipeline
 
 Module Contents
 +++++++++++++++
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index 719d6e12ddfd..f6fdfc1debff 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -12,10 +12,6 @@ By default, vLLM models do not support multi-modal inputs. To enable multi-modal
 you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
 as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
 
-.. contents::
-   :local:
-   :backlinks: none
-
 Module Contents
 +++++++++++++++
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index b8d73b002a41..8f4e108b8cca 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -162,7 +162,8 @@ def register_input_processor(self, processor: InputProcessor):
         The provided function is invoked on each input to the model. This
         happens before :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
 
-        TODO: Add guide [ref: PR #5276]
+        See also:
+            :ref:`input_processing_pipeline`
         """
 
         def wrapper(model_cls: N) -> N:
@@ -185,7 +186,8 @@ def process_input(self, model_config: "ModelConfig",
 
         The model is identified by ``model_config``.
 
-        TODO: Add guide [ref: PR #5276]
+        See also:
+            :ref:`input_processing_pipeline`
         """
         # Avoid circular import
         from vllm.model_executor.model_loader import get_model_architecture
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 2ed4706e982f..b2e2c516895c 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -82,6 +82,9 @@ def register_input_mapper(
         this plugin (see :meth:`get_data_type`), the provided function is
         invoked to transform the data into a dictionary of model inputs.
         If `None` is provided, then the default input mapper is used instead.
+
+        See also:
+            :ref:`input_processing_pipeline`
         """
 
         def wrapper(model_cls: N) -> N:

From 698830f7e5cd12ac7c76baa8e1afdf30cd5725bb Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 09:56:49 +0000
Subject: [PATCH 068/181] Fix name

---
 docs/source/dev/input_processing/input_processing_pipeline.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/dev/input_processing/input_processing_pipeline.rst
index d7f8ffeda71a..68b737f13c70 100644
--- a/docs/source/dev/input_processing/input_processing_pipeline.rst
+++ b/docs/source/dev/input_processing/input_processing_pipeline.rst
@@ -7,5 +7,5 @@ Input Processing Pipeline
 2. Tokenize the data if necessary.
 3. Process the inputs using :meth:`~vllm.inputs.registry.InputRegistry.process_input`.
 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
-5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunner`.
+5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.

From 334b1a96a4080ad0b95fcbe720faab9e1636afbc Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 10:00:39 +0000
Subject: [PATCH 069/181] Add `MultiModalInputs` to docs

---
 docs/source/dev/multimodal/multimodal_index.rst | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index aad2e679a152..c68fe8dbb953 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -41,6 +41,10 @@ Base Classes
     :members:
     :show-inheritance:
 
+.. autoclass:: vllm.multimodal.MultiModalInputs
+    :members:
+    :show-inheritance:
+
 .. autoclass:: vllm.multimodal.MultiModalPlugin
     :members:
     :show-inheritance:

From 36ab12d62430383023ae96cfff00cb3b20674806 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 10:04:43 +0000
Subject: [PATCH 070/181] Fix and add links

---
 .../source/dev/input_processing/input_processing_pipeline.rst | 4 ++--
 vllm/inputs/__init__.py                                       | 3 +++
 vllm/multimodal/__init__.py                                   | 3 +++
 vllm/multimodal/base.py                                       | 2 +-
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/dev/input_processing/input_processing_pipeline.rst
index 68b737f13c70..80fcc379f382 100644
--- a/docs/source/dev/input_processing/input_processing_pipeline.rst
+++ b/docs/source/dev/input_processing/input_processing_pipeline.rst
@@ -5,7 +5,7 @@ Input Processing Pipeline
 
 1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
 2. Tokenize the data if necessary.
-3. Process the inputs using :meth:`~vllm.inputs.registry.InputRegistry.process_input`.
+3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
 5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
-6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`~vllm.multimodal.MultiModalRegistry.map_input`.
+6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
diff --git a/vllm/inputs/__init__.py b/vllm/inputs/__init__.py
index 637c22394c89..d09415696295 100644
--- a/vllm/inputs/__init__.py
+++ b/vllm/inputs/__init__.py
@@ -7,6 +7,9 @@
 """
 The global :class:`~InputRegistry` which is used by :class:`~vllm.LLMEngine`
 to dispatch data processing according to the target model.
+
+See also:
+    :ref:`input_processing_pipeline`
 """
 
 __all__ = [
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index dfd47f476d37..20bd87b8c443 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -5,6 +5,9 @@
 """
 The global :class:`~MultiModalRegistry` is used by model runners to
 dispatch data processing according to its modality and the target model.
+
+See also:
+    :ref:`input_processing_pipeline`
 """
 
 __all__ = [
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index b2e2c516895c..d47cdd559ad8 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -24,7 +24,7 @@ class MultiModalData:
 
     Finally, register the new plugin to
     :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.
-    This enables models to call :meth:`MultiModalRegistry.register_input` for
+    This enables models to call :meth:`MultiModalRegistry.map_input` for
     the new modality.
     """
     pass

From c30342174c849da7f4c6c1abb2e2a94e2f251f89 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 10:08:10 +0000
Subject: [PATCH 071/181] Fix `is_multiscale` not provided anymore

---
 tests/models/test_llava_next.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 2b6e68d8d8eb..a7fcfbd38190 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -77,6 +77,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                 dtype: str, max_tokens: int, is_multiscale: bool) -> None:
     """Inference result should be the same between hf and vllm.

From 0a0c0e353cb98c9ca02c26eecb3c166b69d746cd Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 10:11:33 +0000
Subject: [PATCH 072/181] Also test multiscale input for phi3v

---
 tests/models/test_llava.py      | 18 ++++++++------
 tests/models/test_llava_next.py |  2 +-
 tests/models/test_phi3v.py      | 42 +++++++++++++++++++++++----------
 3 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index f9f366065b90..c6dd49925bd7 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -108,10 +108,14 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                                                   images=vllm_image_inputs)
 
     for i in range(len(HF_IMAGE_PROMPTS)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-            vllm_outputs[i], vlm_config, model_id)
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+        try:
+            hf_output_ids, hf_output_str = hf_outputs[i]
+            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                vllm_outputs[i], vlm_config, model_id)
+            assert hf_output_str == vllm_output_str, (
+                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+            assert hf_output_ids == vllm_output_ids, (
+                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+        except Exception as e:
+            msg = f"Wrong output for inputs {image_inputs[i]}"
+            raise AssertionError(msg) from e
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index a7fcfbd38190..8ce6bd25a8c5 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -131,5 +131,5 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
         except Exception as e:
-            msg = f"Wrong output for combination {image_inputs[i]}"
+            msg = f"Wrong output for inputs {image_inputs[i]}"
             raise AssertionError(msg) from e
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 81f0dc2291c5..1c2784f4770f 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -4,6 +4,8 @@
 import pytest
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_ASSETS
@@ -80,8 +82,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int) -> None:
+                dtype: str, max_tokens: int, is_multiscale: bool) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -95,14 +98,25 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
+    image_inputs = [
+        (rescale_image_size(hf_image, factor),
+         ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
+         prompt) for hf_image, vllm_image, prompt in zip(
+             hf_images, vllm_images, HF_IMAGE_PROMPTS)
+        for factor in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
+    ]
+    prompt_inputs = [prompt for _, _, prompt in image_inputs]
+    hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
+    vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
+
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
     hf_model_kwargs = {"_attn_implementation": "eager"}
     with hf_runner(model_id, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
         hf_outputs = hf_model.generate_greedy(
-            HF_IMAGE_PROMPTS,
+            prompt_inputs,
             max_tokens,
-            images=hf_images,
+            images=hf_image_inputs,
             eos_token_id=hf_model.processor.tokenizer.eos_token_id)
 
     with vllm_runner(model_id,
@@ -110,15 +124,19 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(HF_IMAGE_PROMPTS,
+        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
                                                   max_tokens,
-                                                  images=vllm_images)
+                                                  images=vllm_image_inputs)
 
     for i in range(len(HF_IMAGE_PROMPTS)):
-        hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-            vllm_outputs[i], vlm_config, model_id)
-        assert hf_output_str == vllm_output_str, (
-            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-        assert hf_output_ids == vllm_output_ids, (
-            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+        try:
+            hf_output_ids, hf_output_str = hf_outputs[i]
+            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                vllm_outputs[i], vlm_config, model_id)
+            assert hf_output_str == vllm_output_str, (
+                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+            assert hf_output_ids == vllm_output_ids, (
+                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+        except Exception as e:
+            msg = f"Wrong output for inputs {image_inputs[i]}"
+            raise AssertionError(msg) from e

From 60517a7ee1e1e5acf4928a465414cf190f3603a9 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 10:13:44 +0000
Subject: [PATCH 073/181] Revert max_tokens for phi3v as numerical error still
 persists

---
 tests/models/test_phi3v.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 1c2784f4770f..2c1612a5c02a 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -81,7 +81,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 # difference for longer context (max_tokens=128) and test can't pass
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("max_tokens", [8])
 @pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                 dtype: str, max_tokens: int, is_multiscale: bool) -> None:

From 57df43476b1fd394352e9fcad207ec11ae5c73ff Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 10:14:48 +0000
Subject: [PATCH 074/181] Improve error message

---
 tests/models/test_llava.py      | 5 +++--
 tests/models/test_llava_next.py | 5 +++--
 tests/models/test_phi3v.py      | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index c6dd49925bd7..61e67e55f53e 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -83,12 +83,13 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
+    size_factors = (0.25, 0.5, 1.0) if is_multiscale else (1, )
     image_inputs = [
         (rescale_image_size(hf_image, factor),
          ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
          prompt) for hf_image, vllm_image, prompt in zip(
              hf_images, vllm_images, HF_IMAGE_PROMPTS)
-        for factor in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
+        for factor in size_factors
     ]
     prompt_inputs = [prompt for _, _, prompt in image_inputs]
     hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
@@ -117,5 +118,5 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
         except Exception as e:
-            msg = f"Wrong output for inputs {image_inputs[i]}"
+            msg = f"Wrong output for size factor {size_factors[i]}"
             raise AssertionError(msg) from e
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 8ce6bd25a8c5..6ef113f34504 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -93,12 +93,13 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
+    size_factors = (0.25, 0.5, 1.0) if is_multiscale else (1, )
     image_inputs = [
         (rescale_image_size(hf_image, factor),
          ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
          prompt) for hf_image, vllm_image, prompt in zip(
              hf_images, vllm_images, HF_IMAGE_PROMPTS)
-        for factor in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
+        for factor in size_factors
     ]
     prompt_inputs = [prompt for _, _, prompt in image_inputs]
     hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
@@ -131,5 +132,5 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
         except Exception as e:
-            msg = f"Wrong output for inputs {image_inputs[i]}"
+            msg = f"Wrong output for size factor {size_factors[i]}"
             raise AssertionError(msg) from e
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 2c1612a5c02a..b46a8f785166 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -98,12 +98,13 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
+    size_factors = (0.25, 0.5, 1.0) if is_multiscale else (1, )
     image_inputs = [
         (rescale_image_size(hf_image, factor),
          ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
          prompt) for hf_image, vllm_image, prompt in zip(
              hf_images, vllm_images, HF_IMAGE_PROMPTS)
-        for factor in ((0.25, 0.5, 1.0) if is_multiscale else (1, ))
+        for factor in size_factors
     ]
     prompt_inputs = [prompt for _, _, prompt in image_inputs]
     hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
@@ -138,5 +139,5 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
         except Exception as e:
-            msg = f"Wrong output for inputs {image_inputs[i]}"
+            msg = f"Wrong output for size factor {size_factors[i]}"
             raise AssertionError(msg) from e

From ffe0675ba159bb9ad28c6da69aab7206e9361136 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 13:01:51 +0000
Subject: [PATCH 075/181] Log the full output for easier reference

---
 .buildkite/test-pipeline.yaml |  2 +-
 tests/models/test_phi3v.py    | 91 ++++++++++++++++++++++-------------
 2 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 10cfe35d85be..ac0636e00539 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -118,7 +118,7 @@ steps:
   mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
-    - pytest -v -s models -m vlm
+    - pytest -v -s -rx models -m vlm
 
 - label: Prefix Caching Test
   mirror_hardwares: [amd]
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index b46a8f785166..0ffd555df859 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,5 +1,5 @@
 import re
-from typing import List, Tuple
+from typing import Dict, List, Tuple
 
 import pytest
 
@@ -77,11 +77,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
     target_dtype = "bfloat16"
 
 
-# Since we use _attn_implementation="eager" for hf_runner, there is numeric
-# difference for longer context (max_tokens=128) and test can't pass
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [8])
+@pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                 dtype: str, max_tokens: int, is_multiscale: bool) -> None:
@@ -110,34 +108,59 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
     vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
 
-    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-    hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model_id, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(
-            prompt_inputs,
-            max_tokens,
-            images=hf_image_inputs,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
-
-    with vllm_runner(model_id,
-                     max_model_len=2048,
-                     dtype=dtype,
-                     enforce_eager=True,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
-                                                  max_tokens,
-                                                  images=vllm_image_inputs)
-
-    for i in range(len(HF_IMAGE_PROMPTS)):
+    def run_test(max_tokens: int):
+        # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+        hf_model_kwargs = {"_attn_implementation": "eager"}
+        with hf_runner(model_id, dtype=dtype,
+                       model_kwargs=hf_model_kwargs) as hf_model:
+            hf_outputs = hf_model.generate_greedy(
+                prompt_inputs,
+                max_tokens,
+                images=hf_image_inputs,
+                eos_token_id=hf_model.processor.tokenizer.eos_token_id)
+
+        with vllm_runner(model_id,
+                         max_model_len=2048,
+                         dtype=dtype,
+                         enforce_eager=True,
+                         **vlm_config.as_cli_args_dict()) as vllm_model:
+            vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
+                                                      max_tokens,
+                                                      images=vllm_image_inputs)
+
+        for i in range(len(HF_IMAGE_PROMPTS)):
+            try:
+                hf_output_ids, hf_output_str = hf_outputs[i]
+                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                    vllm_outputs[i], vlm_config, model_id)
+                assert hf_output_str == vllm_output_str, (
+                    f"Test{i}:\nHF: {hf_output_str!r}\n"
+                    f"vLLM: {vllm_output_str!r}")
+                assert hf_output_ids == vllm_output_ids, (
+                    f"Test{i}:\nHF: {hf_output_ids}\n"
+                    f"vLLM: {vllm_output_ids}")
+            except Exception as e:
+                msg = f"Wrong output for size factor {size_factors[i]}"
+                raise AssertionError(msg) from e
+
+    # Since we use _attn_implementation="eager" for hf_runner, there is numeric
+    # difference for longer context (max_tokens=128) and test can't pass
+    fallback_tokens = max_tokens
+    assert_fails: Dict[int, AssertionError] = {}
+    while True:
         try:
-            hf_output_ids, hf_output_str = hf_outputs[i]
-            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                vllm_outputs[i], vlm_config, model_id)
-            assert hf_output_str == vllm_output_str, (
-                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-            assert hf_output_ids == vllm_output_ids, (
-                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-        except Exception as e:
-            msg = f"Wrong output for size factor {size_factors[i]}"
-            raise AssertionError(msg) from e
+            run_test(fallback_tokens)
+        except AssertionError as e:
+            if fallback_tokens == 1:
+                raise
+
+            assert_fails[fallback_tokens] = e
+            fallback_tokens //= 2
+        else:
+            if assert_fails:
+                pytest.xfail("Phi-3-Vision test only passed when max_tokens="
+                             f"{fallback_tokens} (instead of {max_tokens}). "
+                             "Errors encountered for each max_tokens value: "
+                             f"{assert_fails}")
+
+            return

From 4f7b21091d50e320a6481d8329bb44239829492d Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Tue, 25 Jun 2024 16:44:34 -0700
Subject: [PATCH 076/181] [VLM] Remove support for pixel_values and
 image_features.

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 .buildkite/download-images.sh              |   4 -
 docs/source/models/vlm.rst                 |   6 +-
 examples/llava_example.py                  |  55 ++-------
 examples/phi3v_example.py                  |   7 +-
 tests/conftest.py                          |  16 +--
 tests/entrypoints/test_openai_vision.py    |   2 -
 tests/models/test_llava.py                 |  19 ++-
 tests/models/test_llava_next.py            |  20 ++--
 tests/models/test_phi3v.py                 |  18 ++-
 tests/multimodal/test_processor.py         |  40 +++----
 tests/tokenization/test_image_processor.py |  20 ----
 vllm/config.py                             |  36 +-----
 vllm/engine/arg_utils.py                   |  57 +--------
 vllm/entrypoints/openai/api_server.py      |   9 --
 vllm/entrypoints/openai/serving_chat.py    |  11 +-
 vllm/inputs.py                             |   6 +-
 vllm/model_executor/model_loader/loader.py |   2 +-
 vllm/model_executor/models/llava.py        |  77 +++----------
 vllm/model_executor/models/llava_next.py   |  85 ++++----------
 vllm/model_executor/models/phi3v.py        |  16 +--
 vllm/multimodal/image.py                   | 127 +++++----------------
 vllm/multimodal/registry.py                |  62 +++++-----
 vllm/multimodal/utils.py                   |   6 +-
 vllm/sequence.py                           |   8 +-
 vllm/transformers_utils/image_processor.py |   3 -
 vllm/worker/cpu_model_runner.py            |   2 +-
 vllm/worker/model_runner.py                |   2 +-
 27 files changed, 194 insertions(+), 522 deletions(-)
 delete mode 100644 tests/tokenization/test_image_processor.py

diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh
index 389a12956c3c..360a7584bccf 100644
--- a/.buildkite/download-images.sh
+++ b/.buildkite/download-images.sh
@@ -8,10 +8,6 @@ set -o pipefail
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
 
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 1837dd2aa89f..169265078c7f 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -36,7 +36,6 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
         image_feature_size=576,
@@ -49,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
+* ``multi_modal_data``: This is a loosely structured dict that contains multi modal data. 
 
 .. code-block:: python
 
@@ -61,7 +60,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {"image": image},
     })
 
     for o in outputs:
@@ -93,7 +92,6 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
 
     python -m vllm.entrypoints.openai.api_server \
         --model llava-hf/llava-1.5-7b-hf \
-        --image-input-type pixel_values \
         --image-token-id 32000 \
         --image-input-shape 1,3,336,336 \
         --image-feature-size 576 \
diff --git a/examples/llava_example.py b/examples/llava_example.py
index 980d7bf9f8a3..c4ddab299fa5 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -2,37 +2,32 @@
 import os
 import subprocess
 
-import torch
 from PIL import Image
 
 from vllm import LLM
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 
 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
 # You can use `.buildkite/download-images.sh` to download them
 
 
-def run_llava_pixel_values(*, disable_image_processor: bool = False):
+def run_llava():
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
         image_feature_size=576,
-        disable_image_processor=disable_image_processor,
     )
 
     prompt = "<image>" * 576 + (
         "\nUSER: What is the content of this image?\nASSISTANT:")
 
-    if disable_image_processor:
-        image = torch.load("images/stop_sign_pixel_values.pt")
-    else:
-        image = Image.open("images/stop_sign.jpg")
+    image = Image.open("images/stop_sign.jpg")
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {
+            "image": image
+        },
     })
 
     for o in outputs:
@@ -40,45 +35,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
         print(generated_text)
 
 
-def run_llava_image_features():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="image_features",
-        image_token_id=32000,
-        image_input_shape="1,576,1024",
-        image_feature_size=576,
-    )
-
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
-
-    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImageFeatureData(image),
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-
-def main(args):
-    if args.type == "pixel_values":
-        run_llava_pixel_values()
-    else:
-        run_llava_image_features()
+def main():
+    run_llava()
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Demo on Llava")
-    parser.add_argument("--type",
-                        type=str,
-                        choices=["pixel_values", "image_features"],
-                        default="pixel_values",
-                        help="image input type")
-    args = parser.parse_args()
     # Download from s3
     s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
     local_directory = "images"
@@ -95,4 +56,4 @@ def main(args):
         local_directory,
         "--no-sign-request",
     ])
-    main(args)
+    main()
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index 4f37c47ddca8..7d93a6404837 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -4,7 +4,6 @@
 from PIL import Image
 
 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData
 
 
 def run_phi3v():
@@ -12,11 +11,9 @@ def run_phi3v():
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
-        image_input_type="pixel_values",
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
-        disable_image_processor=False,
     )
 
     image = Image.open("images/cherry_blossom.jpg")
@@ -30,7 +27,9 @@ def run_phi3v():
     outputs = llm.generate(
         {
             "prompt": prompt,
-            "multi_modal_data": ImagePixelData(image),
+            "multi_modal_data": {
+                "image": image
+            },
         },
         sampling_params=sampling_params)
     for o in outputs:
diff --git a/tests/conftest.py b/tests/conftest.py
index 9d00c7676694..c3a4c63fd7ce 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,13 +17,13 @@
                           AutoProcessor, AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
-from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.config import TokenizerPoolConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalData
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+from vllm.multimodal.image import ImageData
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
@@ -62,16 +62,8 @@ def pil_image(self) -> Image.Image:
     def for_hf(self) -> Image.Image:
         return self.pil_image
 
-    def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
-        image_input_type = vision_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if image_input_type == ImageInputType.IMAGE_FEATURES:
-            return ImageFeatureData(self.image_features)
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            return ImagePixelData(self.pil_image)
-
-        raise NotImplementedError
+    def for_vllm(self) -> Dict[str, Any]:
+        return {"image": self.pil_image}
 
 
 class _ImageAssetPrompts(TypedDict):
diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py
index 0e8d88b76ffe..c59381dfb0b9 100644
--- a/tests/entrypoints/test_openai_vision.py
+++ b/tests/entrypoints/test_openai_vision.py
@@ -42,8 +42,6 @@ def server():
         "--max-model-len",
         "4096",
         "--enforce-eager",
-        "--image-input-type",
-        "pixel_values",
         "--image-token-id",
         "32000",
         "--image-input-shape",
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index ac1d2ece62b2..e1dd57718690 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -24,17 +24,12 @@ def iter_llava_configs(model_name: str):
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32000,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(image_input_type=None,
+                                    image_feature_size=f,
+                                    image_token_id=32000,
+                                    image_input_shape=input_shape))
 
 
 model_and_vl_config = [
@@ -82,7 +77,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+    vllm_images = [asset.for_vllm() for asset in image_assets]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index d36e503871ca..efab0a241044 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -32,16 +32,14 @@ def iter_llava_next_configs(model_name: str):
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32000,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(
+                   image_input_type=None,
+                   image_feature_size=f,
+                   image_token_id=32000,
+                   image_input_shape=input_shape,
+               ))
 
 
 model_and_vl_config = [
@@ -91,7 +89,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+    vllm_images = [asset.for_vllm() for asset in image_assets]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 03c130466836..39275ee843e5 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -26,16 +26,12 @@ def iter_phi3v_configs(model_name: str):
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
-        for input_type, input_shape in [
-            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
-        ]:
-            yield (model_name,
-                   VisionLanguageConfig(image_input_type=input_type,
-                                        image_feature_size=f,
-                                        image_token_id=32044,
-                                        image_input_shape=input_shape,
-                                        image_processor=model_name,
-                                        image_processor_revision=None))
+        input_shape = (1, 3, h, w)
+        yield (model_name,
+               VisionLanguageConfig(image_input_type=None,
+                                    image_feature_size=f,
+                                    image_token_id=32044,
+                                    image_input_shape=input_shape))
 
 
 model_and_vl_config = [
@@ -95,7 +91,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+    vllm_images = [asset.for_vllm() for asset in image_assets]
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
     hf_model_kwargs = {"_attn_implementation": "eager"}
diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py
index 9ac48dfab678..52231c1b7b70 100644
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
@@ -4,7 +4,7 @@
 
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.image import ImageData
 
 from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 
@@ -27,12 +27,10 @@ def test_clip_image_processor(image_assets, dtype):
         revision=None,
     )
     vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_input_type=None,
         image_token_id=32000,
         image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
         image_feature_size=576,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
     )
 
     for asset in image_assets:
@@ -41,7 +39,7 @@ def test_clip_image_processor(image_assets, dtype):
             return_tensors="pt",
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(asset.pil_image),
+            ImageData(asset.pil_image),
             model_config=model_config,
             vlm_config=vlm_config,
         )
@@ -75,14 +73,11 @@ def test_llava_next_image_processor(image_assets, dtype):
         dtype=dtype,
         revision=None,
     )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=64000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=2928,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
-    )
+    vlm_config = VisionLanguageConfig(image_input_type=None,
+                                      image_token_id=64000,
+                                      image_input_shape=(1, 3, IMAGE_HEIGHT,
+                                                         IMAGE_WIDTH),
+                                      image_feature_size=2928)
 
     for asset in image_assets:
         hf_result = hf_processor.preprocess(
@@ -90,7 +85,7 @@ def test_llava_next_image_processor(image_assets, dtype):
             return_tensors="pt",
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(asset.pil_image),
+            ImageData(asset.pil_image),
             model_config=model_config,
             vlm_config=vlm_config,
         )
@@ -120,23 +115,20 @@ def test_image_pixel_types(image_assets, dtype):
         dtype=dtype,
         revision=None,
     )
-    vlm_config = VisionLanguageConfig(
-        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-        image_token_id=32000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=576,
-        image_processor=MODEL_NAME,
-        image_processor_revision=None,
-    )
+    vlm_config = VisionLanguageConfig(image_input_type=None,
+                                      image_token_id=32000,
+                                      image_input_shape=(1, 3, IMAGE_HEIGHT,
+                                                         IMAGE_WIDTH),
+                                      image_feature_size=576)
 
     for asset in image_assets:
         image_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(asset.pil_image),
+            ImageData(asset.pil_image),
             model_config=model_config,
             vlm_config=vlm_config,
         )
         tensor_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(asset.pixel_values),
+            ImageData(asset.pixel_values),
             model_config=model_config,
             vlm_config=vlm_config,
         )
diff --git a/tests/tokenization/test_image_processor.py b/tests/tokenization/test_image_processor.py
deleted file mode 100644
index 5ba232336741..000000000000
--- a/tests/tokenization/test_image_processor.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import pytest
-from transformers.image_processing_utils import BaseImageProcessor
-
-from vllm.transformers_utils.image_processor import get_image_processor
-
-IMAGE_PROCESSOR_NAMES = [
-    "llava-hf/llava-1.5-7b-hf",
-    "llava-hf/llava-v1.6-34b-hf",
-]
-
-
-@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
-def test_image_processor_revision(processor_name: str):
-    # Assume that "main" branch always exists
-    image_processor = get_image_processor(processor_name, revision="main")
-    assert isinstance(image_processor, BaseImageProcessor)
-
-    # Assume that "never" branch always does not exist
-    with pytest.raises(OSError, match='not a valid git identifier'):
-        get_image_processor(processor_name, revision="never")
diff --git a/vllm/config.py b/vllm/config.py
index 0c4d770e4684..368a41c3f329 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1154,28 +1154,13 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
             raise ValueError("LoRA is not supported with chunked prefill yet.")
 
 
+# TODO: To be replaced by MultiModalConfig.
 @dataclass
 class VisionLanguageConfig:
     """Configs the input data format and how models should run for
     vision language models."""
 
-    class ImageInputType(enum.Enum):
-        """Image input type into the vision language model.
-
-        An image roughly goes through the following transformation:
-        Raw image --> pixel values --> image features --> image embeddings.
-
-        The difference between different image input types is where the
-        image encoder (pixel values --> image features) is run.
-        Different image input types also correspond to different tensor shapes.
-
-        For example, for Llava, PIXEL_VALUES: (1, 3, 336, 336).
-        IMAGE_FEATURES: (1, 576, 1024).
-        """
-        PIXEL_VALUES = enum.auto()
-        IMAGE_FEATURES = enum.auto()
-
-    image_input_type: ImageInputType
+    image_input_type: None
     # The input id corresponding to image token.
     image_token_id: int
     # Used for running `run_prefill_max_token`.
@@ -1183,19 +1168,6 @@ class ImageInputType(enum.Enum):
     # worst case scenario (biggest supported resolution).
     image_input_shape: tuple
     image_feature_size: int
-    # The image processor to load from HuggingFace
-    image_processor: Optional[str]
-    image_processor_revision: Optional[str]
-
-    @classmethod
-    def get_image_input_enum_type(cls, value: str) -> ImageInputType:
-        """Get the image input type from a string."""
-        try:
-            return cls.ImageInputType[value.upper()]
-        except KeyError as e:
-            raise ValueError(f"{value} is not a valid choice. "
-                             f"Expecting to choose from "
-                             f"{[x.name for x in cls.ImageInputType]}.") from e
 
     #TODO(ywang96): make this a cached property once we refactor the
     # VisionLanguageConfig class.
@@ -1214,6 +1186,8 @@ def as_cli_args_dict(self) -> Dict[str, Any]:
         """
         result: Dict[str, Any] = {}
         for f in fields(self):
+            if f.name == "image_input_type":
+                continue
             value = getattr(self, f.name)
             if isinstance(value, enum.Enum):
                 result[f.name] = value.name.lower()
@@ -1222,8 +1196,6 @@ def as_cli_args_dict(self) -> Dict[str, Any]:
             else:
                 result[f.name] = value
 
-        result["disable_image_processor"] = self.image_processor is None
-
         return result
 
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 16374098b23d..afbf0b33d4c8 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1,7 +1,6 @@
 import argparse
 import dataclasses
 import json
-import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 
@@ -80,13 +79,9 @@ class EngineArgs:
     preemption_mode: Optional[str] = None
 
     # Related to Vision-language models such as llava
-    image_input_type: Optional[str] = None
     image_token_id: Optional[int] = None
     image_input_shape: Optional[str] = None
     image_feature_size: Optional[int] = None
-    image_processor: Optional[str] = None
-    image_processor_revision: Optional[str] = None
-    disable_image_processor: bool = False
 
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
@@ -112,14 +107,6 @@ def __post_init__(self):
     @staticmethod
     def add_cli_args_for_vlm(
             parser: FlexibleArgumentParser) -> FlexibleArgumentParser:
-        parser.add_argument('--image-input-type',
-                            type=nullable_str,
-                            default=None,
-                            choices=[
-                                t.name.lower()
-                                for t in VisionLanguageConfig.ImageInputType
-                            ],
-                            help=('The image input type passed into vLLM.'))
         parser.add_argument('--image-token-id',
                             type=int,
                             default=None,
@@ -135,24 +122,6 @@ def add_cli_args_for_vlm(
             type=int,
             default=None,
             help=('The image feature size along the context dimension.'))
-        parser.add_argument(
-            '--image-processor',
-            type=str,
-            default=EngineArgs.image_processor,
-            help='Name or path of the huggingface image processor to use. '
-            'If unspecified, model name or path will be used.')
-        parser.add_argument(
-            '--image-processor-revision',
-            type=str,
-            default=None,
-            help='Revision of the huggingface image processor version to use. '
-            'It can be a branch name, a tag name, or a commit id. '
-            'If unspecified, will use the default version.')
-        parser.add_argument(
-            '--disable-image-processor',
-            action='store_true',
-            help='Disables the use of image processor, even if one is defined '
-            'for the model on huggingface.')
 
         return parser
 
@@ -742,33 +711,17 @@ def create_engine_config(self, ) -> EngineConfig:
             model_loader_extra_config=self.model_loader_extra_config,
         )
 
-        if self.image_input_type:
-            if (not self.image_token_id or not self.image_input_shape
-                    or not self.image_feature_size):
+        if self.image_token_id:
+            if (not self.image_input_shape or not self.image_feature_size):
                 raise ValueError(
-                    'Specify `image_token_id`, `image_input_shape` and '
-                    '`image_feature_size` together with `image_input_type`.')
-
-            if self.image_processor is None:
-                self.image_processor = self.model
-            if self.disable_image_processor:
-                if self.image_processor != self.model:
-                    warnings.warn(
-                        "You've specified an image processor "
-                        f"({self.image_processor}) but also disabled "
-                        "it via `--disable-image-processor`.",
-                        stacklevel=2)
-
-                self.image_processor = None
+                    'Specify `image_input_shape` and '
+                    '`image_feature_size` together with `image_token_id`.')
 
             vision_language_config = VisionLanguageConfig(
-                image_input_type=VisionLanguageConfig.
-                get_image_input_enum_type(self.image_input_type),
+                image_input_type=None,
                 image_token_id=self.image_token_id,
                 image_input_shape=str_to_int_tuple(self.image_input_shape),
                 image_feature_size=self.image_feature_size,
-                image_processor=self.image_processor,
-                image_processor_revision=self.image_processor_revision,
             )
         else:
             vision_language_config = None
diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index ea6275920c79..b56d656bf610 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -184,15 +184,6 @@ async def authentication(request: Request, call_next):
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
 
-    # Enforce pixel values as image input type for vision language models
-    # when serving with API server
-    if engine_args.image_input_type is not None and \
-        engine_args.image_input_type.upper() != "PIXEL_VALUES":
-        raise ValueError(
-            f"Invalid image_input_type: {engine_args.image_input_type}. "
-            "Only --image-input-type 'pixel_values' is supported for serving "
-            "vision language models with the vLLM API server.")
-
     engine = AsyncLLMEngine.from_engine_args(
         engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 744e1d94511b..bd0d82545ca5 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -26,7 +26,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.image import ImageData
 from vllm.multimodal.utils import (async_get_and_parse_image,
                                    get_full_image_text_prompt)
 from vllm.outputs import RequestOutput
@@ -47,8 +47,7 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
     messages: List[ConversationMessage]
-    image_futures: List[Awaitable[ImagePixelData]] = field(
-        default_factory=list)
+    image_futures: List[Awaitable[ImageData]] = field(default_factory=list)
 
 
 class OpenAIServingChat(OpenAIServing):
@@ -103,7 +102,7 @@ def _parse_chat_message_content_parts(
         parts: Iterable[ChatCompletionContentPartParam],
     ) -> ChatMessageParseResult:
         texts: List[str] = []
-        image_futures: List[Awaitable[ImagePixelData]] = []
+        image_futures: List[Awaitable[ImageData]] = []
 
         vlm_config: Optional[VisionLanguageConfig] = getattr(
             self.engine.engine, "vision_language_config", None)
@@ -210,7 +209,7 @@ async def create_chat_completion(
 
         try:
             conversation: List[ConversationMessage] = []
-            image_futures: List[Awaitable[ImagePixelData]] = []
+            image_futures: List[Awaitable[ImageData]] = []
 
             for msg in request.messages:
                 chat_parsed_result = self._parse_chat_message_content(msg)
@@ -228,7 +227,7 @@ async def create_chat_completion(
             return self.create_error_response(str(e))
 
         # Fetch image data
-        image_data: Optional[ImagePixelData] = None
+        image_data: Optional[ImageData] = None
         try:
             if len(image_futures):
                 # since we support only single image currently
diff --git a/vllm/inputs.py b/vllm/inputs.py
index 026903e19a26..518a342ada2c 100644
--- a/vllm/inputs.py
+++ b/vllm/inputs.py
@@ -1,5 +1,5 @@
-from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
-                    TypedDict, Union, cast, overload)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional,
+                    Sequence, TypedDict, Union, cast, overload)
 
 from typing_extensions import NotRequired
 
@@ -127,4 +127,4 @@ class TextTokensPrompt(TypedDict):
 class LLMInputs(TypedDict):
     prompt_token_ids: List[int]
     prompt: NotRequired[Optional[str]]
-    multi_modal_data: NotRequired[Optional["MultiModalData"]]
+    multi_modal_data: NotRequired[Optional[Dict[str, Any]]]
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index d3babcf9c345..9acc38a3dcd4 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -79,7 +79,7 @@ def _get_model_initialization_kwargs(
             "please open an issue on github.")
     elif issubclass(model_class, VisionLanguageModelBase):
         if vision_language_config is None:
-            raise ValueError("Provide `image_input_type` and other vision "
+            raise ValueError("Provide vision "
                              "related configurations through LLM entrypoint "
                              "or engine arguments.")
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8e36c54b1c51..3679916f2752 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,4 +1,4 @@
-from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -74,17 +74,10 @@ class LlavaImagePixelInputs(TypedDict):
     """Shape: (batch_size, num_channels, height, width)"""
 
 
-class LlavaImageFeatureInputs(TypedDict):
-    type: Literal["image_features"]
-    data: torch.Tensor
-    """Shape: (batch_size, image_feature_size, hidden_size)"""
-
+LlavaImageInputs = LlavaImagePixelInputs
 
-LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
-
-@MULTIMODAL_REGISTRY.register_image_feature_input()
-@MULTIMODAL_REGISTRY.register_image_pixel_input()
+@MULTIMODAL_REGISTRY.register_image_input()
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
@@ -97,8 +90,8 @@ def __init__(self,
 
         self.config = config
 
-        if self.vision_language_config.image_input_type == (
-                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
+        # TODO: To be replaced by `multi_modal_config`.
+        if self.vision_language_config:
             self.vision_tower = CLIPVisionModel(config.vision_config)
         else:
             self.vision_tower = None
@@ -137,44 +130,17 @@ def _validate_image_data(self, data: torch.Tensor) -> torch.Tensor:
     def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[LlavaImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
-        image_features = kwargs.pop("image_features", None)
-
-        expected_input_type = self.vision_language_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if image_features is not None:
-                raise ValueError(
-                    "Expected pixel values but got image features")
-            if pixel_values is None:
-                return None
-
-            if not isinstance(pixel_values, torch.Tensor):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            return LlavaImagePixelInputs(
-                type="pixel_values",
-                data=self._validate_image_data(pixel_values),
-            )
+        if pixel_values is None:
+            return None
 
-        if expected_input_type == ImageInputType.IMAGE_FEATURES:
-            if pixel_values is not None:
-                raise ValueError(
-                    "Expected image features but got pixel values")
-            if image_features is None:
-                return None
+        if not isinstance(pixel_values, torch.Tensor):
+            raise ValueError("Incorrect type of pixel values. "
+                             f"Got type: {type(pixel_values)}")
 
-            if not isinstance(image_features, torch.Tensor):
-                raise ValueError("Incorrect type of image features. "
-                                 f"Got type: {type(image_features)}")
-
-            return LlavaImageFeatureInputs(
-                type="image_features",
-                data=self._validate_image_data(image_features),
-            )
-
-        return None
+        return LlavaImagePixelInputs(
+            type="pixel_values",
+            data=self._validate_image_data(pixel_values),
+        )
 
     def _select_image_features(self, image_features: torch.Tensor, *,
                                strategy: str) -> torch.Tensor:
@@ -209,12 +175,8 @@ def _process_image_pixels(self,
 
     def _process_image_input(self,
                              image_input: LlavaImageInputs) -> torch.Tensor:
-        if image_input["type"] == "pixel_values":
-            assert self.vision_tower is not None
-            image_features = self._process_image_pixels(image_input)
-        else:
-            image_features = image_input["data"]
-
+        assert self.vision_tower is not None
+        image_features = self._process_image_pixels(image_input)
         return self.multi_modal_projector(image_features)
 
     def forward(
@@ -245,19 +207,12 @@ def forward(
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        This model has two modes of image inputs:
-        `PIXEL_VALUES` and `IMAGE_FEATURES`.
-
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each input image.
                 Expects a batch with shape `[1, 3, 336, 336]`.
                 (Only applicable to `PIXEL_VALUES` mode)
-            image_features: The image features for each input image outputted by
-                the vision tower before passing to the multi-modal projector.
-                Expects a batch with shape `[1, 576, 1024]`.
-                (Only applicable to `IMAGE_FEATURES` mode)
 
         See also:
             Each input maps to huggingface implementation, as follows:
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index c1158c933c88..9617518d4e31 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,5 +1,4 @@
-from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict,
-                    Union)
+from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -22,7 +21,7 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
+from vllm.multimodal.image import ImageData, get_dummy_image_data
 from vllm.sequence import SamplerOutput, SequenceData
 
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
@@ -45,17 +44,7 @@ class LlavaNextImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
-class LlavaNextImageFeatureInputs(TypedDict):
-    type: Literal["image_features"]
-    data: torch.Tensor
-    """Shape: (batch_size, 1 + num_patches, image_feature_size, hidden_size)"""
-
-    image_sizes: NotRequired[torch.Tensor]
-    """Shape: (batch_size, 2)"""
-
-
-LlavaNextImageInputs = Union[LlavaNextImagePixelInputs,
-                             LlavaNextImageFeatureInputs]
+LlavaNextImageInputs = LlavaNextImagePixelInputs
 
 
 def _get_dummy_image_data(
@@ -66,19 +55,15 @@ def _get_dummy_image_data(
     seq_data, fake_mm_data = get_dummy_image_data(seq_len, model_config,
                                                   vlm_config)
 
-    config_input_type = vlm_config.image_input_type
-    ImageInputType = VisionLanguageConfig.ImageInputType
-
-    if config_input_type == ImageInputType.PIXEL_VALUES:
-        _, c, h, w = vlm_config.image_input_shape
-        mode = {1: "L", 3: "RGB"}[c]
-        fake_mm_data = ImagePixelData(Image.new(mode, (w, h), color=0))
+    _, c, h, w = vlm_config.image_input_shape
+    mode = {1: "L", 3: "RGB"}[c]
+    fake_mm_data = ImageData(Image.new(mode, (w, h), color=0))
 
     return seq_data, fake_mm_data
 
 
 def _image_pixel_processor(
-    data: ImagePixelData,
+    data: ImageData,
     model_config: ModelConfig,
     vlm_config: VisionLanguageConfig,
 ) -> Dict[str, torch.Tensor]:
@@ -100,11 +85,11 @@ def _image_pixel_processor(
 
         data.image = image.resize((w, h))
 
-    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
+    return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \
         ._default_input_processor(data, model_config, vlm_config)
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
+@MULTIMODAL_REGISTRY.register_image_input(_image_pixel_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
 class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
 
@@ -118,11 +103,7 @@ def __init__(self,
         # Update the type annotation from that of its superclass
         self.config = config
 
-        if self.vision_language_config.image_input_type == (
-                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
-            self.vision_tower = CLIPVisionModel(config=config.vision_config)
-        else:
-            raise TypeError("Image features are not supported by LLaVA-NeXT")
+        self.vision_tower = CLIPVisionModel(config=config.vision_config)
 
         self.multi_modal_projector = LlavaMultiModalProjector(
             vision_hidden_size=config.vision_config.hidden_size,
@@ -175,36 +156,23 @@ def _parse_and_validate_image_input(
             self, **kwargs: object) -> Optional[LlavaNextImageInputs]:
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
-        image_features = kwargs.pop("image_features", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
+        if pixel_values is None:
+            return None
 
-        if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if image_features is not None:
-                raise ValueError(
-                    "Expected pixel values but got image features")
-            if pixel_values is None:
-                return None
-
-            if not isinstance(pixel_values, torch.Tensor):
-                raise ValueError("Incorrect type of pixel values. "
-                                 f"Got type: {type(pixel_values)}")
-
-            if not isinstance(image_sizes, torch.Tensor):
-                raise ValueError("Incorrect type of image sizes. "
-                                 f"Got type: {type(image_sizes)}")
-
-            return LlavaNextImagePixelInputs(
-                type="pixel_values",
-                data=self._validate_image_pixels(pixel_values),
-                image_sizes=self._validate_image_sizes(image_sizes),
-            )
+        if not isinstance(pixel_values, torch.Tensor):
+            raise ValueError("Incorrect type of pixel values. "
+                             f"Got type: {type(pixel_values)}")
 
-        assert expected_input_type != ImageInputType.IMAGE_FEATURES, (
-            "Failed to validate this at initialization time")
+        if not isinstance(image_sizes, torch.Tensor):
+            raise ValueError("Incorrect type of image sizes. "
+                             f"Got type: {type(image_sizes)}")
 
-        return None
+        return LlavaNextImagePixelInputs(
+            type="pixel_values",
+            data=self._validate_image_pixels(pixel_values),
+            image_sizes=self._validate_image_sizes(image_sizes),
+        )
 
     def _select_image_features(self, image_features: torch.Tensor, *,
                                strategy: str) -> torch.Tensor:
@@ -311,11 +279,8 @@ def _process_image_pixels(
 
     def _process_image_input(
             self, image_input: LlavaNextImageInputs) -> torch.Tensor:
-        if image_input["type"] == "pixel_values":
-            assert self.vision_tower is not None
-            image_features = self._process_image_pixels(image_input)
-        else:
-            image_features = image_input["data"]
+        assert self.vision_tower is not None
+        image_features = self._process_image_pixels(image_input)
 
         patch_embeddings = self.multi_modal_projector(image_features)
 
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index dac832a686c2..f3fa4dafc4f6 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -35,7 +35,7 @@
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
+from vllm.multimodal.image import ImageData, get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
 logger = init_logger(__name__)
@@ -309,7 +309,7 @@ def calc_hd_transform_size(width, height, hd_num=16):
 
 
 def _image_processor(
-    data: ImagePixelData,
+    data: ImageData,
     model_config: ModelConfig,
     vlm_config: VisionLanguageConfig,
 ) -> Dict[str, torch.Tensor]:
@@ -325,11 +325,11 @@ def _image_processor(
 
             data.image = image.resize((w, h))
 
-    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
+    return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \
             ._default_input_processor(data, model_config, vlm_config)
 
 
-@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor)
+@MULTIMODAL_REGISTRY.register_image_input(_image_processor)
 @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class Phi3VForCausalLM(VisionLanguageModelBase):
 
@@ -352,14 +352,6 @@ def _parse_and_validate_image_input(
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if expected_input_type != ImageInputType.PIXEL_VALUES:
-            raise ValueError(
-                f"Unexpected image input type: {expected_input_type}."
-                "Phi3v only support pixel_values input currently.")
-
         if pixel_values is not None and image_sizes is not None:
             return Phi3VImagePixelInputs(type="pixel_values",
                                          data=pixel_values,
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 08fb09d11160..fe8b3d5ef9a9 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,4 +1,4 @@
-from typing import Dict, Tuple, Type, Union
+from typing import Dict, Tuple, Type
 
 import torch
 from PIL import Image
@@ -12,26 +12,24 @@
 
 logger = init_logger(__name__)
 
+IMAGE_TOKEN_ID = 32000
+IMAGE_FEATURE_SIZE = 576
+IMAGE_SHAPE = (336, 336)
 
+
+# TODO: All the reference to `vlm_config` will be updated to `mm_config`.
+# TODO: This file should also be scoped to mm.
 def _get_dummy_seq_data(seq_len: int,
                         vlm_config: VisionLanguageConfig) -> SequenceData:
-    # NOTE: We assume that <image> token is repeated `image_feature_size` times
-    # and then concatenated with the text prompt
-    # TODO: Enable other ways of inserting the image into the prompt
-
-    token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size
-    token_ids += [0] * (seq_len - vlm_config.image_feature_size)
-
+    assert seq_len >= IMAGE_FEATURE_SIZE, (
+        f"`seq_len` should be at least {IMAGE_FEATURE_SIZE}.")
+    token_ids = [IMAGE_TOKEN_ID] * IMAGE_FEATURE_SIZE
+    token_ids += [0] * (seq_len - IMAGE_FEATURE_SIZE)
     return SequenceData(token_ids)
 
 
-def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor:
-    if vlm_config.image_processor is None:
-        values_dtype = torch.float16
-    else:
-        values_dtype = torch.uint8
-
-    return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype)
+def _get_dummy_image(vlm_config: VisionLanguageConfig) -> Image.Image:
+    return Image.new("RGB", IMAGE_SHAPE, color=(255, 255, 255))
 
 
 def get_dummy_image_data(
@@ -42,72 +40,41 @@ def get_dummy_image_data(
     """Standard dummy data factory for image data (to be used in
     :meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`)."""
     seq_data = _get_dummy_seq_data(seq_len, vlm_config)
-    values = _get_dummy_values(vlm_config)
+    image = _get_dummy_image(vlm_config)
 
-    config_input_type = vlm_config.image_input_type
-    ImageInputType = VisionLanguageConfig.ImageInputType
+    return seq_data, ImageData(image)
 
-    fake_mm_data: MultiModalData
-    if config_input_type == ImageInputType.PIXEL_VALUES:
-        fake_mm_data = ImagePixelData(values)
-    elif config_input_type == ImageInputType.IMAGE_FEATURES:
-        fake_mm_data = ImageFeatureData(values)
-    else:
-        raise NotImplementedError
-
-    return seq_data, fake_mm_data
-
-
-class ImagePixelData(MultiModalData):
-    """
-    The pixel data of an image. Can be one of:
 
-    - :class:``PIL.Image``: An image object. Requires that a HuggingFace
-      processor is available to the model.
-    - :class:``torch.Tensor``: The raw pixel data which is passed to the model
-      without additional pre-processing.
+class ImageData(MultiModalData):
+    """An :class:``PIL.Image`` image. Requires that a HuggingFace
+    processor is available to the model.
     """
 
-    def __init__(self, image: Union[Image.Image, torch.Tensor]) -> None:
-        if isinstance(image, Image.Image):
-            # So that this class can be created inside the Image context manager
-            image.load()
-
+    def __init__(self, image: Image.Image) -> None:
+        # So that this class can be created inside the Image context manager
+        image.load()
         self.image = image
 
     def __repr__(self) -> str:
-        image = self.image
-        if isinstance(image, Image.Image):
-            return f"{type(self).__name__}(image={image})"
-
-        return (f"{type(self).__name__}(image=torch.Tensor(shape="
-                f"{image.shape}, dtype={image.dtype}))")
+        return f"{type(self).__name__}(image={self.image})"
 
 
-class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
+class ImagePlugin(MultiModalPlugin[ImageData]):
 
-    def get_data_type(self) -> Type[ImagePixelData]:
-        return ImagePixelData
-
-    def _get_hf_image_processor(self, model_config: ModelConfig,
-                                vlm_config: VisionLanguageConfig):
-        if vlm_config is None or vlm_config.image_processor is None:
-            return None
+    def get_data_type(self) -> Type[ImageData]:
+        return ImageData
 
+    def _get_hf_image_processor(self, model_config: ModelConfig):
         return cached_get_image_processor(
-            vlm_config.image_processor,
-            trust_remote_code=model_config.trust_remote_code,
-            revision=vlm_config.image_processor_revision,
-        )
+            model_config.model,
+            trust_remote_code=model_config.trust_remote_code)
 
     def _default_input_processor(
-            self, data: ImagePixelData, model_config: ModelConfig,
+            self, data: ImageData, model_config: ModelConfig,
             vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
         image = data.image
-
         if isinstance(image, Image.Image):
-            image_processor = self._get_hf_image_processor(
-                model_config, vlm_config)
+            image_processor = self._get_hf_image_processor(model_config)
             if image_processor is None:
                 raise RuntimeError("No HuggingFace processor is available"
                                    "to process the image object")
@@ -117,39 +84,5 @@ def _default_input_processor(
             except Exception:
                 logger.error("Failed to process image (%s)", image)
                 raise
-        elif isinstance(image, torch.Tensor):
-            pixel_values = image.to(model_config.dtype)
-
-            return {"pixel_values": pixel_values}
 
         raise TypeError(f"Invalid image type: {type(image)}")
-
-
-class ImageFeatureData(MultiModalData):
-    """
-    The feature vector of an image, passed directly to the model.
-
-    This should be the output of the vision tower.
-    """
-
-    def __init__(self, image_features: torch.Tensor) -> None:
-        self.image_features = image_features
-
-    def __repr__(self) -> str:
-        image_features = self.image_features
-
-        return (f"{type(self).__name__}(image_features=torch.Tensor(shape="
-                f"{image_features.shape}, dtype={image_features.dtype}))")
-
-
-class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
-
-    def get_data_type(self) -> Type[ImageFeatureData]:
-        return ImageFeatureData
-
-    def _default_input_processor(
-            self, data: ImageFeatureData, model_config: ModelConfig,
-            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
-        image_features = data.image_features.to(model_config.dtype)
-
-        return {"image_features": image_features}
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 4789ce5ce4cf..189346360201 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,13 +1,14 @@
 import functools
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence,
-                    Tuple, Type, TypeVar)
+                    Tuple, Type, TypeVar, Union)
+
+from PIL import Image
 
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.logger import init_logger
 
 from .base import MultiModalData, MultiModalPlugin
-from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
-                    ImagePixelPlugin)
+from .image import ImageData, ImagePlugin
 
 if TYPE_CHECKING:
     import torch
@@ -32,7 +33,7 @@ class MultiModalRegistry:
     according to its modality and the target model.
     """
 
-    DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin())
+    DEFAULT_PLUGINS = (ImagePlugin(), )
 
     def __init__(self,
                  *,
@@ -53,7 +54,17 @@ def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
 
         self._plugins_by_data_type[data_type] = plugin
 
-    def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]):
+    def _process_external_input(self, data, model_config: ModelConfig,
+                                vlm_config: VisionLanguageConfig):
+        if isinstance(data, Image.Image):
+            return self._get_plugin_for_internal_data_type(
+                ImageData).process_input(ImageData(data), model_config,
+                                         vlm_config)
+        msg = f"Unknown multi-modal data type: {type(data)}"
+        raise NotImplementedError(msg)
+
+    def _get_plugin_for_internal_data_type(self,
+                                           data_type: Type[MultiModalData]):
         for typ in data_type.mro():
             plugin = self._plugins_by_data_type.get(typ)
             if plugin is not None:
@@ -105,41 +116,40 @@ def register_input(
 
         See :meth:`MultiModalPlugin.register_input_processor` for more details.
         """
-        return self._get_plugin_for_data_type(data_type) \
+        return self._get_plugin_for_internal_data_type(data_type) \
             .register_input_processor(processor)
 
-    def register_image_pixel_input(
+    def register_image_input(
             self,
-            processor: Optional[
-                MultiModalInputProcessor[ImagePixelData]] = None):
+            processor: Optional[MultiModalInputProcessor[ImageData]] = None):
         """
         Register an input processor for image pixel data to a model class.
 
         See :meth:`MultiModalPlugin.register_input_processor` for more details.
         """
-        return self.register_input(ImagePixelData, processor)
-
-    def register_image_feature_input(
-        self,
-        processor: Optional[
-            MultiModalInputProcessor[ImageFeatureData]] = None):
-        """
-        Register an input processor for image feature data to a model class.
+        return self.register_input(ImageData, processor)
 
-        See :meth:`MultiModalPlugin.register_input_processor` for more details.
-        """
-        return self.register_input(ImageFeatureData, processor)
-
-    def process_input(self, data: MultiModalData, model_config: ModelConfig,
+    def process_input(self, data: Union[MultiModalData, Dict[str, Any]],
+                      model_config: ModelConfig,
                       vlm_config: VisionLanguageConfig):
         """
-        Apply an input processor to a :class:`~MultiModalData` instance passed
-        to the model.
+        Apply an input processor before passing in to the model.
+
+        If the data is internally supplied (for profiling), 
+        it's of type :class:`~MultiModalData`.
+        If externally supplied through user API, it's of type dict. 
         
         See :meth:`MultiModalPlugin.process_input` for more details.
         """
-        return self._get_plugin_for_data_type(type(data)) \
-            .process_input(data, model_config, vlm_config)
+        if isinstance(data, MultiModalData):
+            return self._get_plugin_for_internal_data_type(type(data)) \
+                .process_input(data, model_config, vlm_config)
+        else:
+            result_list = [
+                self._process_external_input(d, model_config, vlm_config)
+                for d in data.values()
+            ]
+            return {k: v for d in result_list for k, v in d.items()}
 
     def create_input_processor(self, model_config: ModelConfig,
                                vlm_config: VisionLanguageConfig):
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 0cf2c057f892..c80cd7adde58 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -8,7 +8,7 @@
 
 from vllm.config import ModelConfig
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
-from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.image import ImageData
 
 
 class ImageFetchAiohttp:
@@ -56,9 +56,9 @@ async def fetch_image(cls, image_url: str) -> Image.Image:
         return image
 
 
-async def async_get_and_parse_image(image_url: str) -> ImagePixelData:
+async def async_get_and_parse_image(image_url: str) -> ImageData:
     with await ImageFetchAiohttp.fetch_image(image_url) as image:
-        return ImagePixelData(image)
+        return ImageData(image)
 
 
 def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 0925d15461fd..cc1ccb7f3cf0 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -3,7 +3,7 @@
 import enum
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -257,8 +257,8 @@ def prompt_token_ids(self) -> List[int]:
         return self.inputs["prompt_token_ids"]
 
     @property
-    def multi_modal_data(self) -> Optional["MultiModalData"]:
-        return self.inputs.get("multi_modal_data")
+    def multi_modal_data(self) -> Dict[str, Any]:
+        return self.inputs.get("multi_modal_data") or {}
 
     @property
     def lora_int_id(self) -> int:
@@ -640,7 +640,7 @@ def __init__(
         lora_request: Optional[LoRARequest] = None,
         computed_block_nums: Optional[List[int]] = None,
         state: Optional[SequenceGroupState] = None,
-        multi_modal_data: Optional["MultiModalData"] = None,
+        multi_modal_data: Optional[Dict[str, Any]] = None,
         encoder_seq_data: Optional[SequenceData] = None,
         cross_block_table: Optional[List[int]] = None,
     ) -> None:
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index 3239b1d0cfa2..265a8ec99efe 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,5 +1,4 @@
 from functools import lru_cache
-from typing import Optional
 
 from transformers import AutoImageProcessor
 from transformers.image_processing_utils import BaseImageProcessor
@@ -13,7 +12,6 @@ def get_image_processor(
     processor_name: str,
     *args,
     trust_remote_code: bool = False,
-    revision: Optional[str] = None,
     **kwargs,
 ) -> BaseImageProcessor:
     """Gets an image processor for the given model name via HuggingFace."""
@@ -22,7 +20,6 @@ def get_image_processor(
             processor_name,
             *args,
             trust_remote_code=trust_remote_code,
-            revision=revision,
             **kwargs)
     except ValueError as e:
         # If the error pertains to the processor class not existing or not
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index e3464c0d3900..f01c7d74ebe3 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -167,7 +167,7 @@ def _prepare_prompt(
             input_positions.extend(list(range(computed_len, seq_len)))
 
             mm_data = seq_group_metadata.multi_modal_data
-            if mm_data is not None:
+            if mm_data:
                 # Process multi-modal data
                 if self.multi_modal_input_processor is None:
                     raise ValueError(
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 9fdb2ea5dd4e..3f0b455aabab 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -509,7 +509,7 @@ def _prepare_model_input_tensors(
                      is not None else 1))
 
                 mm_data = seq_group_metadata.multi_modal_data
-                if mm_data is not None:
+                if mm_data:
                     # Process multi-modal data
                     if self.multi_modal_input_processor is None:
                         raise ValueError(

From c7a2a66bce58ed8864442a36eaa9172c30173f14 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 15:30:05 +0000
Subject: [PATCH 077/181] Update xfail to be more efficient - Also fix llava
 test

---
 tests/models/test_llava.py      |   3 +-
 tests/models/test_llava_next.py |  58 +++++++++++------
 tests/models/test_phi3v.py      | 108 +++++++++++++++++---------------
 3 files changed, 98 insertions(+), 71 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 61e67e55f53e..328195a11023 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -68,6 +68,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                 dtype: str, max_tokens: int, is_multiscale: bool) -> None:
     """Inference result should be the same between hf and vllm.
@@ -117,6 +118,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                 f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-        except Exception as e:
+        except AssertionError as e:
             msg = f"Wrong output for size factor {size_factors[i]}"
             raise AssertionError(msg) from e
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 6ef113f34504..8d5256841be0 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,5 +1,6 @@
+import itertools
 import re
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import pytest
 from transformers import AutoTokenizer
@@ -105,24 +106,28 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
     vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
 
-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(prompt_inputs,
-                                              max_tokens,
-                                              images=hf_image_inputs)
-
-    with vllm_runner(
-            model_id,
-            dtype=dtype,
-            # should be greater than image_feature_size
-            max_model_len=4096,
-            enforce_eager=True,
-            **vlm_config.as_cli_args_dict(),
-    ) as vllm_model:
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model_id,
+                     dtype=dtype,
+                     max_model_len=4096,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
                                                   max_tokens,
                                                   images=vllm_image_inputs)
 
-    for i in range(len(image_inputs)):
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(prompt_inputs,
+                                              max_tokens,
+                                              images=hf_image_inputs)
+        hf_dummy_outputs = hf_model.generate_greedy(prompt_inputs,
+                                                    max_tokens=1,
+                                                    images=hf_image_inputs)
+
+    # There may be numeric differences for multiscale images due to
+    # our implementation of CLIPVisionModel
+    best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
+    for i in range(len(HF_IMAGE_PROMPTS)):
         try:
             hf_output_ids, hf_output_str = hf_outputs[i]
             vllm_output_ids, vllm_output_str = vllm_to_hf_output(
@@ -131,6 +136,23 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                 f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-        except Exception as e:
-            msg = f"Wrong output for size factor {size_factors[i]}"
-            raise AssertionError(msg) from e
+        except AssertionError as e:
+            num_match_tokens = sum(1 for _ in itertools.takewhile(
+                lambda pair: pair[0] == pair[1],
+                zip(hf_output_ids, vllm_output_ids),
+            ))
+            num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
+
+            best_max_tokens = num_match_tokens - num_prefix_tokens
+            best_max_tokens_exc_list.append((best_max_tokens, e))
+        else:
+            best_max_tokens_exc_list.append((max_tokens, None))
+
+    best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+    if best_max_tokens < max_tokens:
+        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+
+        pytest.xfail(
+            f"Test only fully passes when max_tokens={best_max_tokens} "
+            f"(instead of {max_tokens}). Errors encountered per item: "
+            f"{exc_list}")
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 0ffd555df859..875076c8cffa 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,5 +1,6 @@
+import itertools
 import re
-from typing import Dict, List, Tuple
+from typing import List, Optional, Tuple
 
 import pytest
 
@@ -108,59 +109,62 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
     vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
 
-    def run_test(max_tokens: int):
-        # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-        hf_model_kwargs = {"_attn_implementation": "eager"}
-        with hf_runner(model_id, dtype=dtype,
-                       model_kwargs=hf_model_kwargs) as hf_model:
-            hf_outputs = hf_model.generate_greedy(
-                prompt_inputs,
-                max_tokens,
-                images=hf_image_inputs,
-                eos_token_id=hf_model.processor.tokenizer.eos_token_id)
-
-        with vllm_runner(model_id,
-                         max_model_len=2048,
-                         dtype=dtype,
-                         enforce_eager=True,
-                         **vlm_config.as_cli_args_dict()) as vllm_model:
-            vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
-                                                      max_tokens,
-                                                      images=vllm_image_inputs)
-
-        for i in range(len(HF_IMAGE_PROMPTS)):
-            try:
-                hf_output_ids, hf_output_str = hf_outputs[i]
-                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                    vllm_outputs[i], vlm_config, model_id)
-                assert hf_output_str == vllm_output_str, (
-                    f"Test{i}:\nHF: {hf_output_str!r}\n"
-                    f"vLLM: {vllm_output_str!r}")
-                assert hf_output_ids == vllm_output_ids, (
-                    f"Test{i}:\nHF: {hf_output_ids}\n"
-                    f"vLLM: {vllm_output_ids}")
-            except Exception as e:
-                msg = f"Wrong output for size factor {size_factors[i]}"
-                raise AssertionError(msg) from e
-
-    # Since we use _attn_implementation="eager" for hf_runner, there is numeric
+    # max_model_len should be greater than image_feature_size
+    with vllm_runner(model_id,
+                     max_model_len=2048,
+                     dtype=dtype,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
+                                                  max_tokens,
+                                                  images=vllm_image_inputs)
+
+    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+    hf_model_kwargs = {"_attn_implementation": "eager"}
+    with hf_runner(model_id, dtype=dtype,
+                   model_kwargs=hf_model_kwargs) as hf_model:
+        hf_outputs = hf_model.generate_greedy(
+            prompt_inputs,
+            max_tokens,
+            images=hf_image_inputs,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id,
+        )
+        hf_dummy_outputs = hf_model.generate_greedy(
+            prompt_inputs,
+            max_tokens=1,
+            images=hf_image_inputs,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id,
+        )
+
+    # Since we use _attn_implementation="eager", there is numeric
     # difference for longer context (max_tokens=128) and test can't pass
-    fallback_tokens = max_tokens
-    assert_fails: Dict[int, AssertionError] = {}
-    while True:
+    best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
+    for i in range(len(HF_IMAGE_PROMPTS)):
         try:
-            run_test(fallback_tokens)
+            hf_output_ids, hf_output_str = hf_outputs[i]
+            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                vllm_outputs[i], vlm_config, model_id)
+            assert hf_output_str == vllm_output_str, (
+                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+            assert hf_output_ids == vllm_output_ids, (
+                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
         except AssertionError as e:
-            if fallback_tokens == 1:
-                raise
-
-            assert_fails[fallback_tokens] = e
-            fallback_tokens //= 2
+            num_match_tokens = sum(1 for _ in itertools.takewhile(
+                lambda pair: pair[0] == pair[1],
+                zip(hf_output_ids, vllm_output_ids),
+            ))
+            num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
+
+            best_max_tokens = num_match_tokens - num_prefix_tokens
+            best_max_tokens_exc_list.append((best_max_tokens, e))
         else:
-            if assert_fails:
-                pytest.xfail("Phi-3-Vision test only passed when max_tokens="
-                             f"{fallback_tokens} (instead of {max_tokens}). "
-                             "Errors encountered for each max_tokens value: "
-                             f"{assert_fails}")
+            best_max_tokens_exc_list.append((max_tokens, None))
+
+    best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+    if best_max_tokens < max_tokens:
+        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
 
-            return
+        pytest.xfail(
+            f"Test only fully passes when max_tokens={best_max_tokens} "
+            f"(instead of {max_tokens}). Errors encountered per item: "
+            f"{exc_list}")

From 598e0e3a349734058b98acaa5c737092e382750d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 26 Jun 2024 16:16:32 +0000
Subject: [PATCH 078/181] Also xfail llava test

---
 tests/models/test_llava.py | 41 ++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 328195a11023..edf037d0a33a 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,4 +1,5 @@
-from typing import List, Tuple
+import itertools
+from typing import List, Optional, Tuple
 
 import pytest
 from transformers import AutoTokenizer
@@ -96,11 +97,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
     vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
 
-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(prompt_inputs,
-                                              max_tokens,
-                                              images=hf_image_inputs)
-
+    # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
                      dtype=dtype,
                      enforce_eager=True,
@@ -109,6 +106,17 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                                                   max_tokens,
                                                   images=vllm_image_inputs)
 
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(prompt_inputs,
+                                              max_tokens,
+                                              images=hf_image_inputs)
+        hf_dummy_outputs = hf_model.generate_greedy(prompt_inputs,
+                                                    max_tokens=1,
+                                                    images=hf_image_inputs)
+
+    # There may be numeric differences for multiscale images due to
+    # our implementation of CLIPVisionModel
+    best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
     for i in range(len(HF_IMAGE_PROMPTS)):
         try:
             hf_output_ids, hf_output_str = hf_outputs[i]
@@ -119,5 +127,22 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             assert hf_output_ids == vllm_output_ids, (
                 f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
         except AssertionError as e:
-            msg = f"Wrong output for size factor {size_factors[i]}"
-            raise AssertionError(msg) from e
+            num_match_tokens = sum(1 for _ in itertools.takewhile(
+                lambda pair: pair[0] == pair[1],
+                zip(hf_output_ids, vllm_output_ids),
+            ))
+            num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
+
+            best_max_tokens = num_match_tokens - num_prefix_tokens
+            best_max_tokens_exc_list.append((best_max_tokens, e))
+        else:
+            best_max_tokens_exc_list.append((max_tokens, None))
+
+    best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+    if best_max_tokens < max_tokens:
+        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+
+        pytest.xfail(
+            f"Test only fully passes when max_tokens={best_max_tokens} "
+            f"(instead of {max_tokens}). Errors encountered per item: "
+            f"{exc_list}")

From 174ca90ad10362751e9ccb2138a31d83fdb6c6e4 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Wed, 26 Jun 2024 12:51:27 -0700
Subject: [PATCH 079/181] address comments

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 examples/llava_example.py   |  1 -
 tests/conftest.py           |  1 -
 vllm/inputs.py              |  9 ++++---
 vllm/multimodal/__init__.py |  9 ++++---
 vllm/multimodal/base.py     | 30 +++++++++++++++------
 vllm/multimodal/image.py    |  5 +++-
 vllm/multimodal/registry.py | 52 +++++++++++++++++++++++++------------
 vllm/sequence.py            |  8 +++---
 8 files changed, 76 insertions(+), 39 deletions(-)

diff --git a/examples/llava_example.py b/examples/llava_example.py
index c4ddab299fa5..7f3d84f99f76 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -1,4 +1,3 @@
-import argparse
 import os
 import subprocess
 
diff --git a/tests/conftest.py b/tests/conftest.py
index c3a4c63fd7ce..eff34e9d2937 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -23,7 +23,6 @@
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
 from vllm.multimodal import MultiModalData
-from vllm.multimodal.image import ImageData
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
diff --git a/vllm/inputs.py b/vllm/inputs.py
index 518a342ada2c..71487e89a97c 100644
--- a/vllm/inputs.py
+++ b/vllm/inputs.py
@@ -1,10 +1,10 @@
-from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional,
-                    Sequence, TypedDict, Union, cast, overload)
+from typing import (TYPE_CHECKING, Dict, List, Literal, Optional, Sequence,
+                    TypedDict, Union, cast, overload)
 
 from typing_extensions import NotRequired
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData
 
 
 class ParsedText(TypedDict):
@@ -125,6 +125,7 @@ class TextTokensPrompt(TypedDict):
 
 
 class LLMInputs(TypedDict):
+    """A structured class to construct :class:`Sequence` with. """
     prompt_token_ids: List[int]
     prompt: NotRequired[Optional[str]]
-    multi_modal_data: NotRequired[Optional[Dict[str, Any]]]
+    multi_modal_data: NotRequired[Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]]]
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 270012e7d1c3..e2384ef5f8ad 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,7 +1,10 @@
-from .base import MultiModalData, MultiModalPlugin
+from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin
 from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry
 
 __all__ = [
-    "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
-    "MultiModalRegistry"
+    "MultiModalData",
+    "MultiModalPlugin",
+    "MULTIMODAL_REGISTRY",
+    "MultiModalRegistry",
+    "EXTERNAL_MM_DATA_TYPE",
 ]
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 847752449ba8..da68baf7f3d4 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,12 +1,13 @@
 from abc import ABC, abstractmethod
-from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
-                    TypeVar)
+from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Tuple,
+                    Type, TypeVar, Union)
 
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.logger import init_logger
 
 if TYPE_CHECKING:
     import torch
+    from PIL import Image
     from torch import nn
 
 logger = init_logger(__name__)
@@ -22,7 +23,7 @@ class MultiModalData:
     :class:`~MultiModalPlugin`.
 
     Finally, register the new plugin to
-    :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.
+    :const:`vllm.multimodal.MULTIMODAL_REGISTRY` (beyond the default plugins).
     This enables models to call :meth:`MultiModalRegistry.register_input` for
     the new modality.
     """
@@ -32,6 +33,8 @@ class MultiModalData:
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
+EXTERNAL_MM_DATA_TYPE = Union["Image.Image", "torch.Tensor"]
+
 MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
                                     Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
@@ -62,13 +65,23 @@ def __init__(self) -> None:
                                      MultiModalInputProcessor[D]] = {}
 
     @abstractmethod
-    def get_data_type(self) -> Type[D]:
+    def get_internal_data_type(self) -> Type[D]:
         """
         Get the modality (subclass of :class:`~MultiModalData`) served by
         this plugin.
         """
         raise NotImplementedError
 
+    @abstractmethod
+    def get_external_data_type(self) -> Tuple[str, EXTERNAL_MM_DATA_TYPE]:
+        """The data type that this plugin handles. 
+        
+        For `LLM.generate(multi_modal_data={"key": value})` will 
+        be handled by plugin with an external data type of
+        (key, type(value)). 
+        """
+        raise NotImplementedError
+
     @abstractmethod
     def _default_input_processor(
             self, data: D, model_config: ModelConfig,
@@ -85,10 +98,11 @@ def register_input_processor(self,
         """
         Register an input processor to a model class.
         
-        When the model receives input data that matches the modality served by
-        this plugin (see :meth:`get_data_type`), the provided input processor is
-        applied to preprocess the data. If `None` is provided, then the default
-        input processor is applied instead.
+        When LLM receives input data that matches the modality served by
+        this plugin (see :meth:`get_internal_data_type`), the provided input
+        processor is applied to preprocess the data. 
+        If `None` is provided, then the default input processor is applied 
+        instead.
         """
 
         def wrapper(model_cls: N) -> N:
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index fe8b3d5ef9a9..e32e761bb95f 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -61,9 +61,12 @@ def __repr__(self) -> str:
 
 class ImagePlugin(MultiModalPlugin[ImageData]):
 
-    def get_data_type(self) -> Type[ImageData]:
+    def get_internal_data_type(self) -> Type[ImageData]:
         return ImageData
 
+    def get_external_data_type(self) -> Tuple[str, Type[Image.Image]]:
+        return ("image", Image.Image)
+
     def _get_hf_image_processor(self, model_config: ModelConfig):
         return cached_get_image_processor(
             model_config.model,
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 189346360201..e479384dff2c 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -2,12 +2,10 @@
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence,
                     Tuple, Type, TypeVar, Union)
 
-from PIL import Image
-
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.logger import init_logger
 
-from .base import MultiModalData, MultiModalPlugin
+from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin
 from .image import ImageData, ImagePlugin
 
 if TYPE_CHECKING:
@@ -31,6 +29,8 @@ class MultiModalRegistry:
     """
     This registry is used by model runners to dispatch data processing
     according to its modality and the target model.
+
+    The registry handles both external and internal data input.
     """
 
     DEFAULT_PLUGINS = (ImagePlugin(), )
@@ -39,34 +39,51 @@ def __init__(self,
                  *,
                  plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS
                  ) -> None:
-        self._plugins_by_data_type = {p.get_data_type(): p for p in plugins}
+        self._plugins_by_internal_data_type = {
+            p.get_internal_data_type(): p
+            for p in plugins
+        }
+        self._plugins_by_external_data_type = {
+            p.get_external_data_type(): p
+            for p in plugins
+        }
         self._dummy_factories_by_model_type: Dict[Type["nn.Module"],
                                                   MultiModalDummyFactory] = {}
 
     def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
-        data_type = plugin.get_data_type()
+        data_type = plugin.get_internal_data_type()
 
-        if data_type in self._plugins_by_data_type:
+        if data_type in self._plugins_by_internal_data_type:
             logger.warning(
                 "A plugin is already registered for data type %s, "
                 "and will be overwritten by the new plugin %s.", data_type,
                 plugin)
 
-        self._plugins_by_data_type[data_type] = plugin
+        self._plugins_by_internal_data_type[data_type] = plugin
 
-    def _process_external_input(self, data, model_config: ModelConfig,
+    def _process_external_input(self, key, value, model_config: ModelConfig,
                                 vlm_config: VisionLanguageConfig):
-        if isinstance(data, Image.Image):
-            return self._get_plugin_for_internal_data_type(
-                ImageData).process_input(ImageData(data), model_config,
-                                         vlm_config)
-        msg = f"Unknown multi-modal data type: {type(data)}"
+        plugin = self._get_plugin_for_external_data_type(key, type(value))
+        if plugin:
+            return plugin.process_input(plugin.get_internal_data_type()(value),
+                                        model_config, vlm_config)
+        msg = f"Unknown multi-modal data type: {type(value)}"
+        raise NotImplementedError(msg)
+
+    def _get_plugin_for_external_data_type(self, key: str,
+                                           data_type: Type[Any]):
+        for typ in data_type.mro():
+            plugin = self._plugins_by_external_data_type.get((key, typ))
+            if plugin is not None:
+                return plugin
+
+        msg = f"Unknown multi-modal data type: {data_type}"
         raise NotImplementedError(msg)
 
     def _get_plugin_for_internal_data_type(self,
                                            data_type: Type[MultiModalData]):
         for typ in data_type.mro():
-            plugin = self._plugins_by_data_type.get(typ)
+            plugin = self._plugins_by_internal_data_type.get(typ)
             if plugin is not None:
                 return plugin
 
@@ -129,7 +146,8 @@ def register_image_input(
         """
         return self.register_input(ImageData, processor)
 
-    def process_input(self, data: Union[MultiModalData, Dict[str, Any]],
+    def process_input(self, data: Union[MultiModalData,
+                                        Dict[str, EXTERNAL_MM_DATA_TYPE]],
                       model_config: ModelConfig,
                       vlm_config: VisionLanguageConfig):
         """
@@ -146,8 +164,8 @@ def process_input(self, data: Union[MultiModalData, Dict[str, Any]],
                 .process_input(data, model_config, vlm_config)
         else:
             result_list = [
-                self._process_external_input(d, model_config, vlm_config)
-                for d in data.values()
+                self._process_external_input(k, v, model_config, vlm_config)
+                for k, v in data.items()
             ]
             return {k: v for d in result_list for k, v in d.items()}
 
diff --git a/vllm/sequence.py b/vllm/sequence.py
index cc1ccb7f3cf0..ebc56f65c465 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -3,7 +3,7 @@
 import enum
 from abc import ABC, abstractmethod
 from dataclasses import dataclass, field
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
 import torch
 
@@ -14,7 +14,7 @@
 from vllm.sampling_params import SamplingParams
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 
@@ -257,7 +257,7 @@ def prompt_token_ids(self) -> List[int]:
         return self.inputs["prompt_token_ids"]
 
     @property
-    def multi_modal_data(self) -> Dict[str, Any]:
+    def multi_modal_data(self) -> Dict[str, "EXTERNAL_MM_DATA_TYPE"]:
         return self.inputs.get("multi_modal_data") or {}
 
     @property
@@ -640,7 +640,7 @@ def __init__(
         lora_request: Optional[LoRARequest] = None,
         computed_block_nums: Optional[List[int]] = None,
         state: Optional[SequenceGroupState] = None,
-        multi_modal_data: Optional[Dict[str, Any]] = None,
+        multi_modal_data: Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]] = None,
         encoder_seq_data: Optional[SequenceData] = None,
         cross_block_table: Optional[List[int]] = None,
     ) -> None:

From 5b3e9aae8a2eccd69d5e6d6d8d602721489be470 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Wed, 26 Jun 2024 13:23:58 -0700
Subject: [PATCH 080/181] remove image_input_type altogether.

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 tests/models/test_llava.py         | 3 +--
 tests/models/test_llava_next.py    | 1 -
 tests/models/test_phi3v.py         | 3 +--
 tests/multimodal/test_processor.py | 7 ++-----
 vllm/config.py                     | 4 ----
 vllm/engine/arg_utils.py           | 1 -
 6 files changed, 4 insertions(+), 15 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index e1dd57718690..17cec7fc61ff 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -26,8 +26,7 @@ def iter_llava_configs(model_name: str):
     for (h, w), f in image_hw_to_feature_size.items():
         input_shape = (1, 3, h, w)
         yield (model_name,
-               VisionLanguageConfig(image_input_type=None,
-                                    image_feature_size=f,
+               VisionLanguageConfig(image_feature_size=f,
                                     image_token_id=32000,
                                     image_input_shape=input_shape))
 
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index efab0a241044..0cf2942e94c7 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -35,7 +35,6 @@ def iter_llava_next_configs(model_name: str):
         input_shape = (1, 3, h, w)
         yield (model_name,
                VisionLanguageConfig(
-                   image_input_type=None,
                    image_feature_size=f,
                    image_token_id=32000,
                    image_input_shape=input_shape,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 39275ee843e5..e9cc5e826296 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -28,8 +28,7 @@ def iter_phi3v_configs(model_name: str):
     for (h, w), f in image_hw_to_feature_size.items():
         input_shape = (1, 3, h, w)
         yield (model_name,
-               VisionLanguageConfig(image_input_type=None,
-                                    image_feature_size=f,
+               VisionLanguageConfig(image_feature_size=f,
                                     image_token_id=32044,
                                     image_input_shape=input_shape))
 
diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py
index 52231c1b7b70..12f37e431cae 100644
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
@@ -27,7 +27,6 @@ def test_clip_image_processor(image_assets, dtype):
         revision=None,
     )
     vlm_config = VisionLanguageConfig(
-        image_input_type=None,
         image_token_id=32000,
         image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
         image_feature_size=576,
@@ -73,8 +72,7 @@ def test_llava_next_image_processor(image_assets, dtype):
         dtype=dtype,
         revision=None,
     )
-    vlm_config = VisionLanguageConfig(image_input_type=None,
-                                      image_token_id=64000,
+    vlm_config = VisionLanguageConfig(image_token_id=64000,
                                       image_input_shape=(1, 3, IMAGE_HEIGHT,
                                                          IMAGE_WIDTH),
                                       image_feature_size=2928)
@@ -115,8 +113,7 @@ def test_image_pixel_types(image_assets, dtype):
         dtype=dtype,
         revision=None,
     )
-    vlm_config = VisionLanguageConfig(image_input_type=None,
-                                      image_token_id=32000,
+    vlm_config = VisionLanguageConfig(image_token_id=32000,
                                       image_input_shape=(1, 3, IMAGE_HEIGHT,
                                                          IMAGE_WIDTH),
                                       image_feature_size=576)
diff --git a/vllm/config.py b/vllm/config.py
index 368a41c3f329..cfaef307a19e 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1159,8 +1159,6 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
 class VisionLanguageConfig:
     """Configs the input data format and how models should run for
     vision language models."""
-
-    image_input_type: None
     # The input id corresponding to image token.
     image_token_id: int
     # Used for running `run_prefill_max_token`.
@@ -1186,8 +1184,6 @@ def as_cli_args_dict(self) -> Dict[str, Any]:
         """
         result: Dict[str, Any] = {}
         for f in fields(self):
-            if f.name == "image_input_type":
-                continue
             value = getattr(self, f.name)
             if isinstance(value, enum.Enum):
                 result[f.name] = value.name.lower()
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index afbf0b33d4c8..03d527d5ce90 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -718,7 +718,6 @@ def create_engine_config(self, ) -> EngineConfig:
                     '`image_feature_size` together with `image_token_id`.')
 
             vision_language_config = VisionLanguageConfig(
-                image_input_type=None,
                 image_token_id=self.image_token_id,
                 image_input_shape=str_to_int_tuple(self.image_input_shape),
                 image_feature_size=self.image_feature_size,

From b7acf3a0b943bcc6618a924d14388878fa57ad5a Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Wed, 26 Jun 2024 14:28:12 -0700
Subject: [PATCH 081/181] types

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 docs/source/models/vlm.rst  | 2 +-
 vllm/multimodal/base.py     | 2 +-
 vllm/multimodal/registry.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 169265078c7f..d4613be64aa5 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This is a loosely structured dict that contains multi modal data. 
+* ``multi_modal_data``: This is a dictionary that contains multi-modal data.
 
 .. code-block:: python
 
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index da68baf7f3d4..4ddd8b53dd83 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -73,7 +73,7 @@ def get_internal_data_type(self) -> Type[D]:
         raise NotImplementedError
 
     @abstractmethod
-    def get_external_data_type(self) -> Tuple[str, EXTERNAL_MM_DATA_TYPE]:
+    def get_external_data_type(self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]:
         """The data type that this plugin handles. 
         
         For `LLM.generate(multi_modal_data={"key": value})` will 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index e479384dff2c..ed20c6fd59c1 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -77,7 +77,7 @@ def _get_plugin_for_external_data_type(self, key: str,
             if plugin is not None:
                 return plugin
 
-        msg = f"Unknown multi-modal data type: {data_type}"
+        msg = f"No plugin found for key {key} and type {data_type}"
         raise NotImplementedError(msg)
 
     def _get_plugin_for_internal_data_type(self,

From f22b2198cf99b6d1cd2c5067d0fc1d834d746ec4 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Wed, 26 Jun 2024 14:48:30 -0700
Subject: [PATCH 082/181] format

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 vllm/multimodal/base.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 4ddd8b53dd83..233068c4a545 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -73,7 +73,8 @@ def get_internal_data_type(self) -> Type[D]:
         raise NotImplementedError
 
     @abstractmethod
-    def get_external_data_type(self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]:
+    def get_external_data_type(
+            self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]:
         """The data type that this plugin handles. 
         
         For `LLM.generate(multi_modal_data={"key": value})` will 

From f84d87ac7643ddcb0bf4b0ce16e5aa8ffea18272 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 00:58:23 +0000
Subject: [PATCH 083/181] Update comment

---
 vllm/worker/cpu_model_runner.py | 2 +-
 vllm/worker/model_runner.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index ac999f763f72..d4cfa209036a 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -110,7 +110,7 @@ def __init__(
             self.block_size,
         )
 
-        # Create processor for multi-modal data
+        # Multi-modal data support
         self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
             .create_input_mapper(self.model_config)
 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 6e69cd88e96b..7f4b3444c5f0 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -191,7 +191,7 @@ def __init__(
             self.block_size,
         ) if num_attn_heads else None
 
-        # Create processor for multi-modal data
+        # Multi-modal data support
         self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
             .create_input_mapper(self.model_config)
 

From 5dfb6fc5590d74dc33211657b124e57823b471d8 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 01:02:11 +0000
Subject: [PATCH 084/181] Update docs

---
 .../dev/input_processing/input_processing_pipeline.rst   | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/dev/input_processing/input_processing_pipeline.rst
index 80fcc379f382..e0c773781115 100644
--- a/docs/source/dev/input_processing/input_processing_pipeline.rst
+++ b/docs/source/dev/input_processing/input_processing_pipeline.rst
@@ -4,8 +4,17 @@ Input Processing Pipeline
 =========================
 
 1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
+
 2. Tokenize the data if necessary.
+
 3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
+
+   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
+
 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
+
 5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
+
 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
+
+   - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model.

From bf3281cd7735693e62af89dbb7818de08aea2d89 Mon Sep 17 00:00:00 2001
From: ywang96 <ywang@roblox.com>
Date: Thu, 27 Jun 2024 05:05:29 +0000
Subject: [PATCH 085/181] modify llava_next

---
 vllm/model_executor/models/llava_next.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index b3d2b42ec1d4..596bab621f5d 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -125,8 +125,8 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     hf_config = ctx.get_hf_config(LlavaNextConfig)
     vision_config = hf_config.vision_config
 
-    # Result in the max possible feature size
-    dummy_height = dummy_width = 448
+    #TODO: change the logic for dummy data to support dynamic shape
+    _, _, dummy_height, dummy_width = multimodal_config.image_input_shape
     image_feature_size = _get_llava_next_image_feature_size(
         hf_config, input_height=dummy_height, input_width=dummy_width)
 

From 56e2d3b80f945ffd948115ce855cb2fe2f0998c6 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 00:58:23 +0000
Subject: [PATCH 086/181] Update comment

---
 vllm/worker/cpu_model_runner.py | 2 +-
 vllm/worker/model_runner.py     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index ac999f763f72..d4cfa209036a 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -110,7 +110,7 @@ def __init__(
             self.block_size,
         )
 
-        # Create processor for multi-modal data
+        # Multi-modal data support
         self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
             .create_input_mapper(self.model_config)
 
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 6e69cd88e96b..7f4b3444c5f0 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -191,7 +191,7 @@ def __init__(
             self.block_size,
         ) if num_attn_heads else None
 
-        # Create processor for multi-modal data
+        # Multi-modal data support
         self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
             .create_input_mapper(self.model_config)
 

From d2f8c6def2c9a02671d2d1b559132c4e04d3a6e7 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 01:02:11 +0000
Subject: [PATCH 087/181] Update docs

---
 .../dev/input_processing/input_processing_pipeline.rst   | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/source/dev/input_processing/input_processing_pipeline.rst b/docs/source/dev/input_processing/input_processing_pipeline.rst
index 80fcc379f382..e0c773781115 100644
--- a/docs/source/dev/input_processing/input_processing_pipeline.rst
+++ b/docs/source/dev/input_processing/input_processing_pipeline.rst
@@ -4,8 +4,17 @@ Input Processing Pipeline
 =========================
 
 1. Input data is passed to :class:`~vllm.LLMEngine` (or :class:`~vllm.AsyncLLMEngine`).
+
 2. Tokenize the data if necessary.
+
 3. Process the inputs using :meth:`INPUT_REGISTRY.process_input <vllm.inputs.registry.InputRegistry.process_input>`.
+
+   - For example, add placeholder tokens to reserve KV cache for multi-modal embeddings.
+
 4. Send the processed inputs to :class:`~vllm.executor.executor_base.ExecutorBase`.
+
 5. Distribute the inputs via :class:`~vllm.worker.worker_base.WorkerBase` to :class:`~vllm.worker.model_runner_base.ModelRunnerBase`.
+
 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
+
+   - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model.

From 7c197d274070bbd58562bab707c1b77c10adf2ce Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 05:27:26 +0000
Subject: [PATCH 088/181] Use dynamic image feature size calculation

---
 examples/phi3v_example.py           |  1 -
 vllm/model_executor/models/phi3v.py | 41 ++++++++++++++++++++++-------
 2 files changed, 31 insertions(+), 11 deletions(-)

diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index 4f37c47ddca8..7db558d847eb 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -16,7 +16,6 @@ def run_phi3v():
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
-        disable_image_processor=False,
     )
 
     image = Image.open("images/cherry_blossom.jpg")
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 16b375613178..d1c34eebbccb 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -276,25 +276,44 @@ class Phi3VImagePixelInputs(TypedDict):
     """Shape: (batch_size, 2)"""
 
 
+def _get_phi3v_image_feature_size(
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    h, w = input_height, input_width
+
+    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L178
+    return (h // 336 * w // 336 + 1) * 144 + 1 + (h // 336 + 1) * 12
+
+
 def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
+    multimodal_config = ctx.get_multimodal_config()
+
+    #TODO: change the logic for dummy data to support dynamic shape
+    _, _, dummy_height, dummy_width = multimodal_config.image_input_shape
+    image_feature_size = _get_phi3v_image_feature_size(
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
+
     seq_data = dummy_seq_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         seq_len,
         image_token_id=32044,
-        image_feature_size_override=1921,
+        image_feature_size_override=image_feature_size,
     )
     mm_data = dummy_pixel_data_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        image_width_override=1344,
-        image_height_override=1008,
+        image_width_override=dummy_width,
+        image_height_override=dummy_height,
     )
 
     return seq_data, mm_data
 
 
-# FIXME(Isotr0py): Remove these after dynamic num_img_tokens is supported
-# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
-def calc_padded_size(width, height, padding_unit=336):
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336):
     target_height = int(np.ceil(height / padding_unit) * padding_unit)
     top_padding = int((target_height - height) / 2)
     bottom_padding = target_height - height - top_padding
@@ -303,8 +322,8 @@ def calc_padded_size(width, height, padding_unit=336):
     return padded_width, padded_height
 
 
-# copied from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
-def calc_hd_transform_size(width, height, hd_num=16):
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
     transposed = False
     if width < height:
         width, height = height, width
@@ -319,7 +338,8 @@ def calc_hd_transform_size(width, height, hd_num=16):
     new_width = int(scale * 336)
     new_height = int(new_width / ratio)
 
-    padded_width, padded_height = calc_padded_size(new_width, new_height)
+    padded_width, padded_height = _calc_padded_size(width=new_width,
+                                                    height=new_height)
 
     if transposed:
         padded_width, padded_height = padded_height, padded_width
@@ -334,7 +354,8 @@ def _image_processor(ctx: InputContext,
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
         _, _, h, w = ctx.get_multimodal_config().image_input_shape
-        if (w, h) != calc_hd_transform_size(image.width, image.height):
+        if (w, h) != _calc_hd_transform_size(width=image.width,
+                                             height=image.height):
             logger.warning(
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)

From f5ffd3e9e86a5e383f4887305d8937b1ea2ed67f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 06:05:18 +0000
Subject: [PATCH 089/181] Fix phi3v not handling `image_sizes` correctly

---
 vllm/model_executor/models/phi3v.py | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index d1c34eebbccb..53c5f4b6d517 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -68,12 +68,6 @@ def __init__(self, wte=None) -> None:
         self.type_feature: str
         self.img_processor: CLIPVisionModel
 
-    def set_img_features(self, img_features: torch.FloatTensor) -> None:
-        self.img_features = img_features
-
-    def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
-        self.img_sizes = img_sizes
-
     def get_img_features(self,
                          img_embeds: torch.FloatTensor) -> torch.FloatTensor:
         LAYER_IDX = self.layer_idx
@@ -115,7 +109,6 @@ def __init__(self,
         self.num_img_tokens = config.img_processor['num_img_tokens']
 
         self.image_dim_out = image_dim_out
-        self.img_sizes = None
 
         # global_gn and sub_gn for hd transform, serves as line separator
         self.use_hd_transform = config.embd_layer.get('use_hd_transform',
@@ -142,7 +135,6 @@ def __init__(self,
         self.img_projection = nn.Sequential(*layers)
 
         self.vocab_size = config.vocab_size
-        self.img_features = None
 
         self.layer_idx = config.img_processor.get('layer_idx', -2)
         self.type_feature = config.img_processor.get('type_feature', 'patch')
@@ -150,18 +142,11 @@ def __init__(self,
     def forward(self,
                 input_ids: torch.LongTensor,
                 pixel_values: torch.FloatTensor,
-                image_sizes=None) -> torch.FloatTensor:
+                image_sizes: Optional[torch.Tensor]) -> torch.FloatTensor:
         """process and merge text embeddings with image embeddings."""
 
         img_embeds = pixel_values
-        img_sizes = image_sizes
-
-        if self.img_features is not None:
-            img_embeds = self.img_features.clone()
-            self.img_features = None
-
-        if self.img_sizes is not None:
-            img_sizes = self.img_sizes
+        img_sizes = [] if image_sizes is None else image_sizes
 
         input_shape = input_ids.size()
         input_ids = input_ids.view(-1, input_shape[-1])
@@ -193,11 +178,8 @@ def forward(self,
             output_imgs = []
             output_len = []
 
-            if isinstance(img_sizes, torch.Tensor):
-                img_sizes.squeeze_(0)
-
             for _bs in range(bs):
-                h, w = img_sizes
+                h, w = img_sizes[_bs]
                 h = h // 336
                 w = w // 336
                 B_ = h * w

From 66aad21ac1460530e107e7610979ce5171e2de6d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 06:14:33 +0000
Subject: [PATCH 090/181] Apply formatter

---
 vllm/model_executor/models/phi3v.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 53c5f4b6d517..f74b13f9fd8a 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -139,8 +139,7 @@ def __init__(self,
         self.layer_idx = config.img_processor.get('layer_idx', -2)
         self.type_feature = config.img_processor.get('type_feature', 'patch')
 
-    def forward(self,
-                input_ids: torch.LongTensor,
+    def forward(self, input_ids: torch.LongTensor,
                 pixel_values: torch.FloatTensor,
                 image_sizes: Optional[torch.Tensor]) -> torch.FloatTensor:
         """process and merge text embeddings with image embeddings."""

From 5f32d534e5c8d986833e86ba69fd7f9578673141 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 01:07:27 +0000
Subject: [PATCH 091/181] Add see also

---
 docs/source/dev/multimodal/adding_multimodal_model.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index c09452fee587..c217fde4a5f9 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -54,6 +54,9 @@ This decorator accepts a function that maps multi-modal inputs to the keyword ar
 
 A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
 
+.. seealso::
+    :ref:`input_processing_pipeline`
+
 
 2. (Optional) Register dummy data
 ---------------------------------
@@ -75,6 +78,9 @@ Here are some examples:
 - Image inputs (static feature size): `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
 - Image inputs (dynamic feature size): `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
 
+.. seealso::
+    :ref:`input_processing_pipeline`
+
 
 3. (Optional) Register input processor
 --------------------------------------
@@ -97,3 +103,6 @@ Here are some examples:
 
 - Insert static number of image tokens: `LLaVA-1.5 Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava.py>`__
 - Insert dynamic number of image tokens: `LLaVA-NeXT Model <https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/llava_next.py>`__
+
+.. seealso::
+    :ref:`input_processing_pipeline`

From 15df4ef420c9d3c9f4abb2d93120d6e70061456f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 01:18:23 +0000
Subject: [PATCH 092/181] Update examples prompt format

---
 docs/source/models/vlm.rst     |  9 +++-----
 examples/llava_example.py      |  2 +-
 examples/llava_next_example.py | 38 ++++++++++++++++++++++++++++++++++
 examples/phi3v_example.py      |  2 --
 4 files changed, 42 insertions(+), 9 deletions(-)
 create mode 100644 examples/llava_next_example.py

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index e4cc65895996..4335f974ae57 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -47,12 +47,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
-* ``prompt``: The prompt is expected to have a single ``<image>`` token per image.
+* ``prompt``: The prompt should follow the same format as that for the HuggingFace version of the model.
 * ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
 
 .. code-block:: python
 
-    prompt = "<image>\nUSER: What is the content of this image?\nASSISTANT:"
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
     # Load the image using PIL.Image
     image = ...
@@ -68,8 +68,6 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
 A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
 
-.. important::
-    We will remove the need to format image tokens in a future release. Afterwards, the input text will follow the same format as that for the original HuggingFace model.
 
 Online OpenAI Vision API Compatible Inference
 ----------------------------------------------
@@ -139,5 +137,4 @@ A full code example can be found in `examples/openai_vision_api_client.py <https
         export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
 .. note::
-    The prompt formatting with the image token ``<image>`` is not needed when serving VLMs with the API server since the prompt will be 
-    processed automatically by the server.
+    There is no need to format the prompt in the API request when since it will be handled by the server.
diff --git a/examples/llava_example.py b/examples/llava_example.py
index 7d6433c91b19..b2e1d53b593f 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -48,7 +48,7 @@ def run_llava_image_features():
         image_feature_size=576,
     )
 
-    prompt = "<image>\nUSER: What is the content of this image?\nASSISTANT:"
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
     image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
 
diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
new file mode 100644
index 000000000000..8a25ad3d256b
--- /dev/null
+++ b/examples/llava_next_example.py
@@ -0,0 +1,38 @@
+from io import BytesIO
+
+import requests
+from PIL import Image
+
+from vllm import LLM, SamplingParams
+from vllm.multimodal.image import ImagePixelData
+
+# Dynamic image input is currently not supported and therefore
+# a fixed image input shape and its corresponding feature size is required.
+# See https://github.com/vllm-project/vllm/pull/4199 for the complete
+# configuration matrix.
+
+llm = LLM(
+    model="llava-hf/llava-v1.6-mistral-7b-hf",
+    image_input_type="pixel_values",
+    image_token_id=32000,
+    image_input_shape="1,3,336,336",
+    image_feature_size=1176,
+)
+
+prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
+url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+image = Image.open(BytesIO(requests.get(url).content))
+sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
+
+outputs = llm.generate(
+    {
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
+    },
+    sampling_params=sampling_params)
+
+generated_text = ""
+for o in outputs:
+    generated_text += o.outputs[0].text
+
+print(f"LLM output:{generated_text}")
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index 7db558d847eb..e9fcea7796d0 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -22,8 +22,6 @@ def run_phi3v():
 
     # single-image prompt
     prompt = "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n"  # noqa: E501
-    prompt = prompt.replace("<|image_1|>", "<|image|>" * 1921 + "<s>")
-
     sampling_params = SamplingParams(temperature=0, max_tokens=64)
 
     outputs = llm.generate(

From ce06541ecbce9b82c03f0e81eba7bfe0829afa04 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 08:46:02 +0000
Subject: [PATCH 093/181] Fix config

---
 vllm/model_executor/models/phi3v.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index e3d99e71ce27..674287eadcfe 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -358,14 +358,17 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,
                  config: PretrainedConfig,
-                 vision_language_config: VisionLanguageConfig,
+                 vlm_config: VisionLanguageConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
-        super().__init__(vision_language_config)
+        super().__init__()
+
         self.config = config
+        self.vlm_config = vlm_config
+
         self.model = LlamaModel(config, cache_config, quant_config)
         self.vision_embed_tokens = Phi3HDImageEmbedding(
-            vision_language_config, config, self.model.embed_tokens)
+            vlm_config, config, self.model.embed_tokens)
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()
@@ -375,7 +378,7 @@ def _parse_and_validate_image_input(
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
+        expected_input_type = self.vlm_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
         if expected_input_type != ImageInputType.PIXEL_VALUES:

From cdcc2d448108a879694948014f6620585be8afd7 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 08:46:02 +0000
Subject: [PATCH 094/181] Fix config

---
 vllm/model_executor/models/phi3v.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 009deaed3700..c40fdf04ae49 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -403,19 +403,20 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,
                  config: PretrainedConfig,
-                 vision_language_config: VisionLanguageConfig,
+                 vlm_config: VisionLanguageConfig,
                  cache_config: Optional[CacheConfig] = None,
                  quant_config: Optional[QuantizationConfig] = None) -> None:
-        super().__init__(vision_language_config)
+        super().__init__()
 
         self.config = config
+        self.vlm_config = vlm_config
 
         self.model = LlamaModel(config, cache_config, quant_config)
 
-        if self.vision_language_config.image_input_type == (
+        if self.vlm_config.image_input_type == (
                 VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
             self.vision_embed_tokens = Phi3HDImageEmbedding(
-                vision_language_config, config, self.model.embed_tokens)
+                vlm_config, config, self.model.embed_tokens)
         else:
             raise TypeError("Image features are not supported by LLaVA-NeXT")
 
@@ -428,7 +429,7 @@ def _parse_and_validate_image_input(
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
-        expected_input_type = self.vision_language_config.image_input_type
+        expected_input_type = self.vlm_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
         if expected_input_type == ImageInputType.PIXEL_VALUES:

From 4212abfe0177f191d652f03e03c4dbeae6fcb15f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 08:58:44 +0000
Subject: [PATCH 095/181] Update docs

---
 .../multimodal/adding_multimodal_model.rst    | 77 +++++++++++--------
 1 file changed, 46 insertions(+), 31 deletions(-)

diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index c217fde4a5f9..73de441a1574 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -15,42 +15,53 @@ This document provides a high-level guide on integrating a :ref:`multimodal mode
     We will be happy to help you out!
 
 
-0. Set up a base vLLM model
+1. Set up a base vLLM model
 ---------------------------
 
-Follow :ref:`these steps <adding_a_new_model>` to first implement the model in vLLM.
-While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
-for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:
 
-.. code-block:: diff
+- You should additionally implement the :class:`~vllm.model_executor.models.interfaces.SupportsVision` interface.
 
-    def forward(
-        self,
-        input_ids: torch.Tensor,
-        positions: torch.Tensor,
-        kv_caches: List[torch.Tensor],
-        attn_metadata: AttentionMetadata,
-    +   pixel_values: torch.Tensor,
-    ) -> SamplerOutput:
+  .. code-block:: diff
 
-.. note::
-    The model class does not have to be named :code:`*ForCausalLM`.
-    Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.
+      + from vllm.model_executor.models.interfaces import SupportsVision
+
+      - class YourModelForImage2Seq(nn.Module):
+      + class YourModelForImage2Seq(nn.Module, SupportsVision):
+
+  .. note::
+      The model class does not have to be named :code:`*ForCausalLM`.
+      Check out `the HuggingFace Transformers documentation <https://huggingface.co/docs/transformers/model_doc/auto#multimodal>`__ for some examples.
+
+- While implementing the :meth:`~torch.nn.Module.forward` method, reserve a keyword parameter
+  for each input tensor that corresponds to a multi-modal input, as shown in the following example:
+
+  .. code-block:: diff
+
+      def forward(
+          self,
+          input_ids: torch.Tensor,
+          positions: torch.Tensor,
+          kv_caches: List[torch.Tensor],
+          attn_metadata: AttentionMetadata,
+      +   pixel_values: torch.Tensor,
+      ) -> SamplerOutput:
 
 
-1. Register input mappers
+2. Register input mappers
 -------------------------
 
-For each modality type to support, decorate the model class with :meth:`vllm.INPUT_REGISTRY.MULTIMODAL.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
+For each modality type to support, decorate the model class with :meth:`vllm.multimodal.MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
 This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.
 
 .. code-block:: diff
 
-    + from vllm.inputs import INPUT_REGISTRY
+    from vllm.model_executor.models.interfaces import SupportsVision
+    + from vllm.multimodal import MULTIMODAL_REGISTRY
 
-    + @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
-    + @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
-    class YourModelForImage2Seq(nn.Module):
+    + @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+    + @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
+    class YourModelForImage2Seq(nn.Module, SupportsVision):
 
 A default mapper is available for each modality in the core vLLM library. This input mapper will be used if you do not provide your own function.
 
@@ -58,7 +69,7 @@ A default mapper is available for each modality in the core vLLM library. This i
     :ref:`input_processing_pipeline`
 
 
-2. (Optional) Register dummy data
+3. (Optional) Register dummy data
 ---------------------------------
 
 During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
@@ -67,11 +78,13 @@ In such cases, you can define your own dummy data by registering a factory metho
 .. code-block:: diff
 
     from vllm.inputs import INPUT_REGISTRY
+    from vllm.model_executor.models.interfaces import SupportsVision
+    from vllm.multimodal import MULTIMODAL_REGISTRY
 
-    @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
-    @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
+    @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+    @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
     + @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
-    class YourModelForImage2Seq(nn.Module):
+    class YourModelForImage2Seq(nn.Module, SupportsVision):
 
 Here are some examples:
 
@@ -82,21 +95,23 @@ Here are some examples:
     :ref:`input_processing_pipeline`
 
 
-3. (Optional) Register input processor
+4. (Optional) Register input processor
 --------------------------------------
 
-Sometimes, there is a need to process inputs at the :class:~vllm.LLMEngine` level before they are passed to the model executor.
+Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor.
 You can register input processors via :meth:`vllm.inputs.INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
 
 .. code-block:: diff
 
     from vllm.inputs import INPUT_REGISTRY
+    from vllm.model_executor.models.interfaces import SupportsVision
+    from vllm.multimodal import MULTIMODAL_REGISTRY
 
-    @INPUT_REGISTRY.MULTIMODAL.register_image_feature_input_mapper()
-    @INPUT_REGISTRY.MULTIMODAL.register_image_pixel_input_mapper()
+    @MULTIMODAL_REGISTRY.register_image_feature_input_mapper()
+    @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
     @INPUT_REGISTRY.register_dummy_data(<your_dummy_data_factory>)
     + @INPUT_REGISTRY.register_input_processor(<your_input_processor>)
-    class YourModelForImage2Seq(nn.Module):
+    class YourModelForImage2Seq(nn.Module, SupportsVision):
 
 A common use case of input processors is inserting placeholder tokens to leverage the vLLM framework for attention mask generation.
 Here are some examples:

From 07c08e3e3ff9b220f711caf0405bd1b91cd2d83e Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 11:06:32 +0000
Subject: [PATCH 096/181] Update docs

---
 docs/source/dev/multimodal/adding_multimodal_model.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index 73de441a1574..f6d2d548a520 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -51,7 +51,7 @@ As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model
 2. Register input mappers
 -------------------------
 
-For each modality type to support, decorate the model class with :meth:`vllm.multimodal.MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
+For each modality type to support, decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_input_mapper <vllm.multimodal.MultiModalRegistry.register_input_mapper>`.
 This decorator accepts a function that maps multi-modal inputs to the keyword arguments you have previously defined in :meth:`~torch.nn.Module.forward`.
 
 .. code-block:: diff
@@ -73,7 +73,7 @@ A default mapper is available for each modality in the core vLLM library. This i
 ---------------------------------
 
 During startup, dummy data is passed to the vLLM model to allocate memory. This only consists of text input by default, which may not be applicable to multi-modal models.
-In such cases, you can define your own dummy data by registering a factory method via :meth:`vllm.inputs.INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
+In such cases, you can define your own dummy data by registering a factory method via :meth:`INPUT_REGISTRY.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`.
 
 .. code-block:: diff
 

From f3f5854c332574a68c606fb0ea043f6553503bba Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 11:06:39 +0000
Subject: [PATCH 097/181] Fix `MultiModalInputs` not working in Python 3.8

---
 vllm/multimodal/base.py | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index cab9b65d0362..f9dc5cc21f33 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,3 +1,4 @@
+import sys
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
 from typing import (Callable, Dict, Generic, List, Optional, Type, TypeVar,
@@ -37,8 +38,17 @@ class MultiModalData:
 tensor; otherwise, this is a list of tensors with one element per batch.
 """
 
+if sys.version_info < (3, 9):
+    # UserDict cannot be subscripted
+    class MultiModalInputsBase(UserDict):
+        pass
+else:
 
-class MultiModalInputs(UserDict[str, torch.Tensor]):
+    class MultiModalInputsBase(UserDict[str, torch.Tensor]):
+        pass
+
+
+class MultiModalInputs(MultiModalInputsBase):
     """
     A dictionary that represents the keyword arguments to
     :meth:`~torch.nn.Module.forward`.

From bebf9e725334ccd075d9dfd051fbc5cc63a3ab0f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Thu, 27 Jun 2024 11:08:31 +0000
Subject: [PATCH 098/181] Fix `_ImageAssets` not working in Python 3.8

---
 tests/conftest.py       | 13 ++++++++++++-
 vllm/multimodal/base.py |  6 +++---
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 9d00c7676694..d300b0bf6a48 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,7 @@
 import contextlib
 import gc
 import os
+import sys
 from collections import UserList
 from dataclasses import dataclass
 from functools import cached_property
@@ -79,7 +80,17 @@ class _ImageAssetPrompts(TypedDict):
     cherry_blossom: str
 
 
-class _ImageAssets(UserList[ImageAsset]):
+if sys.version_info < (3, 9):
+    # UserList cannot be subscripted
+    class _ImageAssetsBase(UserList):
+        pass
+else:
+
+    class _ImageAssetsBase(UserList[ImageAsset]):
+        pass
+
+
+class _ImageAssets(_ImageAssetsBase):
 
     def __init__(self) -> None:
         super().__init__(
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index f9dc5cc21f33..2e19be682777 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -40,15 +40,15 @@ class MultiModalData:
 
 if sys.version_info < (3, 9):
     # UserDict cannot be subscripted
-    class MultiModalInputsBase(UserDict):
+    class _MultiModalInputsBase(UserDict):
         pass
 else:
 
-    class MultiModalInputsBase(UserDict[str, torch.Tensor]):
+    class _MultiModalInputsBase(UserDict[str, torch.Tensor]):
         pass
 
 
-class MultiModalInputs(MultiModalInputsBase):
+class MultiModalInputs(_MultiModalInputsBase):
     """
     A dictionary that represents the keyword arguments to
     :meth:`~torch.nn.Module.forward`.

From 43350b83438615587a46464d1685013965819b43 Mon Sep 17 00:00:00 2001
From: ywang96 <ywang@roblox.com>
Date: Fri, 28 Jun 2024 07:46:21 +0000
Subject: [PATCH 099/181] update example

---
 examples/phi3v_example.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index c068b9a9844e..46b7be5cd94d 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -11,7 +11,7 @@ def run_phi3v():
     model_path = "microsoft/Phi-3-vision-128k-instruct"
 
     # Note: The model has 128k context length by default which may cause OOM
-    # If that's the case, override `max_model_len` with a smaller value via args
+    # In this example, we override max_model_len to 2048.
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
@@ -19,6 +19,7 @@ def run_phi3v():
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
+        max_model_len=2048,
     )
 
     image = Image.open("images/cherry_blossom.jpg")

From 57791de2bc897676d54a069b7494b40c4f1591c4 Mon Sep 17 00:00:00 2001
From: ywang96 <ywang@roblox.com>
Date: Fri, 28 Jun 2024 08:01:04 +0000
Subject: [PATCH 100/181] update doc

---
 docs/source/dev/input_processing/model_inputs_index.rst | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
index e9d5bf176a3c..594edeb746bb 100644
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -6,7 +6,10 @@ Input Processing
 .. currentmodule:: vllm.inputs
 
 vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
-in :class:`~vllm.LLMEngine` before they are passed to model executors.
+in :class:`~vllm.LLMEngine` before they are passed to model executors. 
+
+Currently, this mechanism is only utilized in **multi-modal models** for preprocessing multi-modal input 
+data in addition to input prompt, but it can be extended to text-only language models when needed.
 
 Guides
 ++++++

From fbc5f704faf86bfe1d29ad8b4aecd3e382240cd0 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 08:51:03 +0000
Subject: [PATCH 101/181] Update docs

---
 docs/source/dev/input_processing/model_inputs_index.rst | 2 +-
 docs/source/dev/multimodal/adding_multimodal_model.rst  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/dev/input_processing/model_inputs_index.rst b/docs/source/dev/input_processing/model_inputs_index.rst
index 594edeb746bb..2dde251aa144 100644
--- a/docs/source/dev/input_processing/model_inputs_index.rst
+++ b/docs/source/dev/input_processing/model_inputs_index.rst
@@ -8,7 +8,7 @@ Input Processing
 vLLM provides a mechanism for defining input processors for each model so that the inputs are processed
 in :class:`~vllm.LLMEngine` before they are passed to model executors. 
 
-Currently, this mechanism is only utilized in **multi-modal models** for preprocessing multi-modal input 
+Currently, this mechanism is only utilized in :ref:`multi-modal models <multi_modality>` for preprocessing multi-modal input 
 data in addition to input prompt, but it can be extended to text-only language models when needed.
 
 Guides
diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index f6d2d548a520..de95485b84a9 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -3,7 +3,7 @@
 Adding a New Multimodal Model
 =============================
 
-This document provides a high-level guide on integrating a :ref:`multimodal model <multi_modality>` into vLLM.
+This document provides a high-level guide on integrating a :ref:`multi-modal model <multi_modality>` into vLLM.
 
 .. note::
     The complexity of adding a new model depends heavily on the model's architecture.
@@ -15,8 +15,8 @@ This document provides a high-level guide on integrating a :ref:`multimodal mode
     We will be happy to help you out!
 
 
-1. Set up a base vLLM model
----------------------------
+1. Set up the base vLLM model
+-----------------------------
 
 As usual, follow :ref:`these steps <adding_a_new_model>` to implement the model in vLLM, but note the following:
 
@@ -99,7 +99,7 @@ Here are some examples:
 --------------------------------------
 
 Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor.
-You can register input processors via :meth:`vllm.inputs.INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
+You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
 
 .. code-block:: diff
 

From 5d23a96ddeba2c7b150d3ac6356693ced4324bef Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 12:19:12 +0000
Subject: [PATCH 102/181] Apply formatter

---
 vllm/multimodal/base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index e751d4719650..20486f67a025 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -40,6 +40,7 @@ class MultiModalData:
 
 if sys.version_info < (3, 9):
     # UserDict cannot be subscripted
+    class _MultiModalInputsBase(UserDict):
         pass
 else:
 

From 78064e0a0bd7832e69bff53559da1747c27b9b0b Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 12:39:24 +0000
Subject: [PATCH 103/181] Fix OpenAI server not working for phi3v

---
 vllm/entrypoints/openai/serving_chat.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 51b3bed6f6d9..9093b1486c3e 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -99,6 +99,12 @@ def _load_chat_template(self, chat_template: Optional[str]):
 
     @cached_property
     def image_token_str(self) -> str:
+        # TODO: Let user specify how to insert image tokens into prompt
+        # (similar to chat template)
+        if self.model_config.hf_config.model_type == "phi3_v":
+            # Workaround since this token is not defined in the tokenizer
+            return "<|image_1|>"
+
         vlm_config = self.model_config.multimodal_config
         if vlm_config is None:
             raise ValueError(

From 4cb809c5b5e2b5fe6138a17e58a719f3a07d2191 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 12:51:15 +0000
Subject: [PATCH 104/181] Preemptively handle upcoming models

---
 vllm/entrypoints/openai/serving_chat.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 9093b1486c3e..abcd9e0b236e 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -98,13 +98,19 @@ def _load_chat_template(self, chat_template: Optional[str]):
                 "No chat template provided. Chat API will not work.")
 
     @cached_property
-    def image_token_str(self) -> str:
+    def image_token_str(self) -> Optional[str]:
         # TODO: Let user specify how to insert image tokens into prompt
         # (similar to chat template)
-        if self.model_config.hf_config.model_type == "phi3_v":
+        model_type = self.model_config.hf_config.model_type
+        if model_type == "phi3_v":
             # Workaround since this token is not defined in the tokenizer
             return "<|image_1|>"
+        if model_type in ("blip-2", "fuyu", "paligemma"):
+            # These models do not use image tokens in the prompt
+            return None
 
+        # The default behaviour assumes that the image token is
+        # available to the tokenizer. (Suitable for LLaVA)
         vlm_config = self.model_config.multimodal_config
         if vlm_config is None:
             raise ValueError(
@@ -140,6 +146,9 @@ def _parse_chat_message_content_parts(
                     )
 
                 image_token = self.image_token_str
+                if image_token is not None:
+                    texts.append(image_token)
+
                 image_url = cast(ChatCompletionContentPartImageParam,
                                  part)["image_url"]
 
@@ -148,7 +157,6 @@ def _parse_chat_message_content_parts(
                         "'image_url.detail' is currently not supported and "
                         "will be ignored.")
 
-                texts.append(image_token)
                 image_future = async_get_and_parse_image(image_url["url"])
                 image_futures.append(image_future)
             else:

From 754e2381ef7f97fad0280b92e09f1fcb473298e1 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 13:22:53 +0000
Subject: [PATCH 105/181] Add more models

---
 vllm/entrypoints/openai/serving_chat.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index abcd9e0b236e..f70e32c517b4 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -105,12 +105,14 @@ def image_token_str(self) -> Optional[str]:
         if model_type == "phi3_v":
             # Workaround since this token is not defined in the tokenizer
             return "<|image_1|>"
-        if model_type in ("blip-2", "fuyu", "paligemma"):
+        if model_type in ("blip-2", "chatglm", "fuyu", "minicpmv",
+                          "paligemma"):
             # These models do not use image tokens in the prompt
             return None
 
         # The default behaviour assumes that the image token is
-        # available to the tokenizer. (Suitable for LLaVA)
+        # available to the tokenizer.
+        # (Suitable for LLaVA, Idefics2, DeepSeek-VL)
         vlm_config = self.model_config.multimodal_config
         if vlm_config is None:
             raise ValueError(

From 9edb53c56573c9b537bd13555a2d755e91ccf5fc Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 28 Jun 2024 14:57:01 +0000
Subject: [PATCH 106/181] Update feature size for dummy data

---
 vllm/model_executor/models/llava_next.py |  8 +--
 vllm/model_executor/models/phi3v.py      | 84 +++++++++++++-----------
 2 files changed, 50 insertions(+), 42 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index a4aecacb86ab..c7d072ee79c1 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -83,7 +83,7 @@ def _get_llava_next_num_unpadded_features(
 
 
 # Based on: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L111
-def _get_llava_next_image_feature_size(
+def get_llava_next_image_feature_size(
     hf_config: LlavaNextConfig,
     *,
     input_height: int,
@@ -125,9 +125,9 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     hf_config = ctx.get_hf_config(LlavaNextConfig)
     vision_config = hf_config.vision_config
 
-    # Result in the max possible feature size
+    # Result in the max possible feature size (2x2 grid of 336x336px tiles)
     dummy_height = dummy_width = 448
-    image_feature_size = _get_llava_next_image_feature_size(
+    image_feature_size = get_llava_next_image_feature_size(
         hf_config, input_height=dummy_height, input_width=dummy_width)
 
     if isinstance(vision_config, CLIPVisionConfig):
@@ -177,7 +177,7 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
         else:
             width, height = image.size
 
-        image_feature_size = _get_llava_next_image_feature_size(
+        image_feature_size = get_llava_next_image_feature_size(
             hf_config, input_height=height, input_width=width)
     else:
         image_features = multi_modal_data.image_features
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index b39d25041b28..757d979771ef 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -266,41 +266,7 @@ class Phi3VImagePixelInputs(TypedDict):
     """
 
 
-def _get_phi3v_image_feature_size(
-    *,
-    input_height: int,
-    input_width: int,
-) -> int:
-    h, w = input_height, input_width
-
-    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L178
-    return (h // 336 * w // 336 + 1) * 144 + 1 + (h // 336 + 1) * 12
-
-
-def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
-    # TODO: How to get the max possible feature size?
-    dummy_height, dummy_width = 1344, 1008
-    image_feature_size = _get_phi3v_image_feature_size(
-        input_height=dummy_height,
-        input_width=dummy_width,
-    )
-
-    seq_data = dummy_seq_data_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        seq_len,
-        image_token_id=32044,
-        image_feature_size_override=image_feature_size,
-    )
-    mm_data = dummy_pixel_data_for_clip(
-        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
-        image_width_override=dummy_width,
-        image_height_override=dummy_height,
-    )
-
-    return seq_data, mm_data
-
-
-# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L57
 def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336):
     target_height = int(np.ceil(height / padding_unit) * padding_unit)
     top_padding = int((target_height - height) / 2)
@@ -310,7 +276,7 @@ def _calc_padded_size(*, width: int, height: int, padding_unit: int = 336):
     return padded_width, padded_height
 
 
-# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L90
 def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
     transposed = False
     if width < height:
@@ -335,6 +301,46 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
     return padded_width, padded_height
 
 
+# Based on https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/blob/main/image_processing_phi3_v.py#L181
+def get_phi3v_image_feature_size(
+    hf_config: PretrainedConfig,
+    *,
+    input_height: int,
+    input_width: int,
+) -> int:
+    num_crops = getattr(hf_config, "num_crops", 16)
+    new_width, new_height = _calc_hd_transform_size(width=input_width,
+                                                    height=input_height,
+                                                    hd_num=num_crops)
+
+    return (new_height // 336 * new_width // 336 + 1) * 144 + 1 \
+        + (new_height // 336 + 1) * 12
+
+
+def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
+    # Result in the max possible feature size (h:w = 16:1)
+    dummy_height, dummy_width = 8000, 50
+    image_feature_size = get_phi3v_image_feature_size(
+        ctx.get_hf_config(PretrainedConfig),
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
+
+    seq_data = dummy_seq_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        seq_len,
+        image_token_id=32044,
+        image_feature_size_override=image_feature_size,
+    )
+    mm_data = dummy_pixel_data_for_clip(
+        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
+        image_width_override=dummy_width,
+        image_height_override=dummy_height,
+    )
+
+    return seq_data, mm_data
+
+
 def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
     multi_modal_data = llm_inputs.get("multi_modal_data")
     if multi_modal_data is None or not isinstance(
@@ -343,6 +349,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
 
     model_config = ctx.model_config
     multimodal_config = ctx.get_multimodal_config()
+    hf_config = ctx.get_hf_config(PretrainedConfig)
 
     if isinstance(multi_modal_data, ImagePixelData):
         image = multi_modal_data.image
@@ -353,8 +360,9 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
 
         w, h = _calc_hd_transform_size(width=w, height=h)
 
-        image_feature_size = _get_phi3v_image_feature_size(input_width=w,
-                                                           input_height=h)
+        image_feature_size = get_phi3v_image_feature_size(hf_config,
+                                                          input_width=w,
+                                                          input_height=h)
     else:
         image_features = multi_modal_data.image_features
         image_feature_size = image_features.shape[-2]

From f84b793faf61086244ac1c19b67e4932568abb01 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Fri, 28 Jun 2024 08:22:57 -0700
Subject: [PATCH 107/181] format

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 tests/multimodal/test_mapper.py            | 18 +-----------------
 vllm/model_executor/models/llava.py        |  4 +---
 vllm/model_executor/models/llava_next.py   |  4 ++--
 vllm/multimodal/__init__.py                |  2 +-
 vllm/multimodal/base.py                    |  1 -
 vllm/multimodal/registry.py                | 17 ++++++++++-------
 vllm/transformers_utils/image_processor.py |  2 --
 7 files changed, 15 insertions(+), 33 deletions(-)

diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 3a5049f1ba7c..2238b0e9f9f7 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -2,7 +2,7 @@
 import pytest
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import ImageData
 
@@ -12,7 +12,6 @@
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_clip_image_processor(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
     assert isinstance(hf_processor, CLIPImageProcessor)
@@ -26,11 +25,6 @@ def test_clip_image_processor(image_assets, dtype):
         dtype=dtype,
         revision=None,
     )
-    multimodal_config=VisionLanguageConfig(
-        image_token_id=32000,
-        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-        image_feature_size=576,
-    )
 
     for asset in image_assets:
         hf_result = hf_processor.preprocess(
@@ -57,7 +51,6 @@ def test_clip_image_processor(image_assets, dtype):
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_llava_next_image_processor(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
     assert isinstance(hf_processor, LlavaNextImageProcessor)
@@ -71,10 +64,6 @@ def test_llava_next_image_processor(image_assets, dtype):
         dtype=dtype,
         revision=None,
     )
-    multimodal_config = VisionLanguageConfig(image_token_id=64000,
-                                      image_input_shape=(1, 3, IMAGE_HEIGHT,
-                                                         IMAGE_WIDTH),
-                                      image_feature_size=2928)
 
     for asset in image_assets:
         hf_result = hf_processor.preprocess(
@@ -100,7 +89,6 @@ def test_llava_next_image_processor(image_assets, dtype):
 @pytest.mark.parametrize("dtype", ["float"])
 def test_image_pixel_types(image_assets, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     model_config = ModelConfig(
         model=MODEL_NAME,
@@ -111,10 +99,6 @@ def test_image_pixel_types(image_assets, dtype):
         dtype=dtype,
         revision=None,
     )
-    multimodal_config = VisionLanguageConfig(image_token_id=32000,
-                                      image_input_shape=(1, 3, IMAGE_HEIGHT,
-                                                         IMAGE_WIDTH),
-                                      image_feature_size=576)
     for asset in image_assets:
         image_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 87dab86f4941..72d39bb124ef 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -79,7 +79,6 @@ class LlavaImagePixelInputs(TypedDict):
 
 
 def dummy_data_for_llava(ctx: InputContext, seq_len: int):
-    multimodal_config = ctx.get_multimodal_config()
     hf_config = ctx.get_hf_config(LlavaConfig)
     vision_config = hf_config.vision_config
 
@@ -153,14 +152,13 @@ def _parse_and_validate_image_input(
 
         if not isinstance(pixel_values, torch.Tensor):
             raise ValueError("Incorrect type of pixel values. "
-                                f"Got type: {type(pixel_values)}")
+                             f"Got type: {type(pixel_values)}")
 
         return LlavaImagePixelInputs(
             type="pixel_values",
             data=self._validate_image_data(pixel_values),
         )
 
-
     def _select_image_features(self, image_features: torch.Tensor, *,
                                strategy: str) -> torch.Tensor:
         # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421  # noqa
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index a65376eedcbe..dfdf512b968a 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -24,8 +24,8 @@
 from vllm.multimodal.image import ImageData
 from vllm.sequence import SamplerOutput
 
-from .clip import (dummy_pixel_data_for_clip,
-                   dummy_seq_data_for_clip, get_clip_patch_grid_length)
+from .clip import (dummy_pixel_data_for_clip, dummy_seq_data_for_clip,
+                   get_clip_patch_grid_length)
 from .interfaces import SupportsVision
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
 
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index 1d76d2d82b48..a4ccfea0783e 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,4 @@
-from .base import MultiModalData, MultiModalPlugin, EXTERNAL_MM_DATA_TYPE
+from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 4fc2f480378b..6e1bf2fea385 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -36,7 +36,6 @@ class MultiModalData:
 
 EXTERNAL_MM_DATA_TYPE = Union["Image.Image", "torch.Tensor"]
 MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
-
 """Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 4900823b2ef7..d1fa5c1f58bf 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,7 +6,8 @@
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin, MultiModalInputMapper
+from .base import (EXTERNAL_MM_DATA_TYPE, MultiModalData,
+                   MultiModalInputMapper, MultiModalPlugin)
 from .image import ImageData, ImagePlugin
 
 logger = init_logger(__name__)
@@ -63,7 +64,8 @@ def register_image_input_mapper(
     def _process_external_input(self, key, value, model_config: ModelConfig):
         plugin = self._get_plugin_for_external_data_type(key, type(value))
         if plugin:
-            return plugin.map_input(model_config, plugin.get_internal_data_type()(value))
+            return plugin.map_input(model_config,
+                                    plugin.get_internal_data_type()(value))
         msg = f"Unknown multi-modal data type: {type(value)}"
         raise NotImplementedError(msg)
 
@@ -100,9 +102,9 @@ def register_input_mapper(
         return self._get_plugin_for_internal_data_type(data_type) \
             .register_input_mapper(mapper)
 
-    def register_image_input(
-            self,
-            mapper: Optional[MultiModalInputMapper[ImageData]] = None):
+    def register_image_input(self,
+                             mapper: Optional[
+                                 MultiModalInputMapper[ImageData]] = None):
         """
         Register an input mapper for image pixel data to a model class.
 
@@ -110,8 +112,9 @@ def register_image_input(
         """
         return self.register_input_mapper(ImageData, mapper)
 
-    def map_input(self, model_config: ModelConfig, data: Union[MultiModalData,
-                                        Dict[str, EXTERNAL_MM_DATA_TYPE]]):
+    def map_input(self, model_config: ModelConfig,
+                  data: Union[MultiModalData, Dict[str,
+                                                   EXTERNAL_MM_DATA_TYPE]]):
         """
         Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model.
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index af1d772fc466..354dcb526395 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,5 +1,3 @@
-from typing import Optional
-
 from transformers import AutoImageProcessor
 from transformers.image_processing_utils import BaseImageProcessor
 

From a934663fc33561b251df1c7cbd361fe734e97014 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Fri, 28 Jun 2024 09:57:19 -0700
Subject: [PATCH 108/181] ExternalMultiModalDataDict

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 vllm/inputs/data.py         |  6 +++---
 vllm/multimodal/__init__.py |  4 ++--
 vllm/multimodal/base.py     | 18 +++++++++++++-----
 vllm/multimodal/registry.py |  7 +++----
 vllm/sequence.py            |  6 +++---
 5 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 932b8b7b38c5..2f761057bea5 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -1,10 +1,10 @@
-from typing import (TYPE_CHECKING, Dict, List, Literal, Optional, Sequence,
+from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence,
                     TypedDict, Union, cast, overload)
 
 from typing_extensions import NotRequired
 
 if TYPE_CHECKING:
-    from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData
+    from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData
 
 
 class ParsedText(TypedDict):
@@ -136,7 +136,7 @@ class LLMInputs(TypedDict):
     The original prompt text corresponding to the token IDs, if available.
     """
 
-    multi_modal_data: NotRequired[Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]]]
+    multi_modal_data: NotRequired[Optional["ExternalMultiModalDataDict"]]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index a4ccfea0783e..a9bd58a29549 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,4 @@
-from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin
+from .base import ExternalMultiModalDataDict, MultiModalData, MultiModalPlugin
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -15,5 +15,5 @@
     "MultiModalPlugin",
     "MULTIMODAL_REGISTRY",
     "MultiModalRegistry",
-    "EXTERNAL_MM_DATA_TYPE",
+    "ExternalMultiModalDataDict",
 ]
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 6e1bf2fea385..5a2e28cd7a39 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Tuple,
-                    Type, TypeVar, Union)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, Optional,
+                    Tuple, Type, TypedDict, TypeVar, Union)
 
 from vllm.config import ModelConfig
 from vllm.inputs import InputContext
@@ -18,6 +18,8 @@ class MultiModalData:
     """
     Base class that contains multi-modal data.
 
+    This is for internal use.
+
     To add a new modality, add a new file under ``multimodal`` directory.
 
     In this new file, subclass :class:`~MultiModalData` and
@@ -34,7 +36,14 @@ class MultiModalData:
 D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
-EXTERNAL_MM_DATA_TYPE = Union["Image.Image", "torch.Tensor"]
+
+class ExternalMultiModalDataBuiltins(TypedDict, total=False):
+    image: Union["Image.Image", "torch.Tensor"]
+
+
+ExternalMultiModalDataDict = Union[ExternalMultiModalDataBuiltins, Dict[str,
+                                                                        Any]]
+
 MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
@@ -65,8 +74,7 @@ def get_internal_data_type(self) -> Type[D]:
         raise NotImplementedError
 
     @abstractmethod
-    def get_external_data_type(
-            self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]:
+    def get_external_data_type(self) -> Tuple[str, Type[Any]]:
         """The data type that this plugin handles. 
         
         For `LLM.generate(multi_modal_data={"key": value})` will 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index d1fa5c1f58bf..a0bd960705e3 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,12 +1,12 @@
 import functools
-from typing import Any, Dict, Optional, Sequence, Type, TypeVar, Union
+from typing import Any, Optional, Sequence, Type, TypeVar, Union
 
 from torch import nn
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import (EXTERNAL_MM_DATA_TYPE, MultiModalData,
+from .base import (ExternalMultiModalDataDict, MultiModalData,
                    MultiModalInputMapper, MultiModalPlugin)
 from .image import ImageData, ImagePlugin
 
@@ -113,8 +113,7 @@ def register_image_input(self,
         return self.register_input_mapper(ImageData, mapper)
 
     def map_input(self, model_config: ModelConfig,
-                  data: Union[MultiModalData, Dict[str,
-                                                   EXTERNAL_MM_DATA_TYPE]]):
+                  data: Union[MultiModalData, ExternalMultiModalDataDict]):
         """
         Apply an input mapper to a :class:`~MultiModalData` instance passed
         to the model.
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 5cf4a71ea403..549810e8f0a3 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -14,7 +14,7 @@
 
 if TYPE_CHECKING:
     from vllm.inputs import LLMInputs
-    from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData
+    from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 
@@ -258,7 +258,7 @@ def prompt_token_ids(self) -> List[int]:
         return self.inputs["prompt_token_ids"]
 
     @property
-    def multi_modal_data(self) -> Dict[str, "EXTERNAL_MM_DATA_TYPE"]:
+    def multi_modal_data(self) -> "ExternalMultiModalDataDict":
         return self.inputs.get("multi_modal_data") or {}
 
     @property
@@ -617,7 +617,7 @@ def __init__(
         lora_request: Optional[LoRARequest] = None,
         computed_block_nums: Optional[List[int]] = None,
         state: Optional[SequenceGroupState] = None,
-        multi_modal_data: Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]] = None,
+        multi_modal_data: Optional["ExternalMultiModalDataDict"] = None,
         encoder_seq_data: Optional[SequenceData] = None,
         cross_block_table: Optional[List[int]] = None,
     ) -> None:

From 2144d3a013b23f15331a22c55b90f203f7d13545 Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Fri, 28 Jun 2024 12:57:40 -0700
Subject: [PATCH 109/181] mention schema

---
 docs/source/models/vlm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index d4613be64aa5..639e7a9a9284 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This is a dictionary that contains multi-modal data.
+* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`ExternalMultiModalDataDict`.
 
 .. code-block:: python
 

From 2795b16f5b8e9ccf6dbd9b2f57404d8cd3967bbf Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 03:07:41 +0000
Subject: [PATCH 110/181] Use a less strict check

---
 vllm/worker/model_runner.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index cf557f2ff745..47c01dfb7d12 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -752,8 +752,10 @@ def profile_run(self) -> None:
 
             seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
                 .dummy_data_for_profiling(model_config, seq_len)
-            assert len(seq_data.prompt_token_ids) == seq_len, (
-                f"Wrong number of tokens generated. Expected: {seq_len} "
+
+            # Having more tokens is over-conservative but otherwise fine
+            assert len(seq_data.prompt_token_ids) >= seq_len, (
+                f"Expected at least {seq_len} dummy tokens for profiling, "
                 f"but got: {len(seq_data.prompt_token_ids)}")
 
             seq = SequenceGroupMetadata(

From 86ffd6014f3d1954e54189347b6ca85c2137fdf8 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 04:26:41 +0000
Subject: [PATCH 111/181] Fix phi3v test

---
 tests/models/test_phi3v.py | 38 +++++++++++++++++++++-----------------
 1 file changed, 21 insertions(+), 17 deletions(-)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 875076c8cffa..88be80cee5d4 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -3,8 +3,10 @@
 from typing import List, Optional, Tuple
 
 import pytest
+from transformers import AutoConfig, AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.model_executor.models.phi3v import get_phi3v_image_feature_size
 from vllm.multimodal.image import ImagePixelData
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
@@ -46,29 +48,23 @@ def iter_phi3v_configs(model_name: str):
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
-                      vlm_config: VisionLanguageConfig, model_id: str):
+                      vlm_config: VisionLanguageConfig, model_id: str,
+                      image_feature_size: int):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
     output_ids, output_str = vllm_output
-    image_token_id = vlm_config.image_token_id
+    output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", " ", output_str)
 
-    hf_output_ids: List[int] = []
-    for idx, token_id in enumerate(output_ids):
-        if token_id != image_token_id:
-            hf_output_ids.append(token_id)
-        else:
-            hf_output_ids.append(0)
-
-            if output_ids[idx + 1] != image_token_id:
-                hf_output_ids.extend([1, 29871])
-
-    hf_output_str = output_str.replace("<|user|>", "") \
+    hf_output_str = output_str_without_image.replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
-    hf_output_str = re.sub(r"(<\|image\|>)+", " ", hf_output_str)
-    hf_output_str = re.sub(r"(<\|image_\d+\|>)+", " ", hf_output_str)
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    hf_output_ids = tokenizer.encode(output_str_without_image)
+    hf_output_ids = hf_output_ids[:4] + [0] * image_feature_size \
+        + [1] + hf_output_ids[4:]
 
     return hf_output_ids, hf_output_str
 
@@ -94,6 +90,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
+    hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
@@ -111,7 +109,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
-                     max_model_len=2048,
+                     max_model_len=4096,
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
@@ -140,10 +138,16 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     # difference for longer context (max_tokens=128) and test can't pass
     best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
     for i in range(len(HF_IMAGE_PROMPTS)):
+        image_feature_size = get_phi3v_image_feature_size(
+            hf_config,
+            input_height=hf_image_inputs[i].height,
+            input_width=hf_image_inputs[i].width,
+        )
+
         try:
             hf_output_ids, hf_output_str = hf_outputs[i]
             vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                vllm_outputs[i], vlm_config, model_id)
+                vllm_outputs[i], vlm_config, model_id, image_feature_size)
             assert hf_output_str == vllm_output_str, (
                 f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
             assert hf_output_ids == vllm_output_ids, (

From f339dd1f58a86844cfc327f198a35810cc7569d9 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 04:27:12 +0000
Subject: [PATCH 112/181] Update default length as the dummy image feature size
 is increased

---
 examples/phi3v_example.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index c8455da30e9b..c4b3c488faa1 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -11,7 +11,8 @@ def run_phi3v():
     model_path = "microsoft/Phi-3-vision-128k-instruct"
 
     # Note: The model has 128k context length by default which may cause OOM
-    # In this example, we override max_model_len to 2048.
+    # In this example, we override max_model_len to 4096
+    # (>= image_feature_size).
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
@@ -19,7 +20,7 @@ def run_phi3v():
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,
-        max_model_len=2048,
+        max_model_len=4096,
     )
 
     image = Image.open("images/cherry_blossom.jpg")

From 59a7a4cea225934a93aaecfebb0ff9c92a3ec79b Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 04:28:45 +0000
Subject: [PATCH 113/181] Raise full error if output is completely different

---
 tests/models/test_llava.py      | 5 +++--
 tests/models/test_llava_next.py | 5 +++--
 tests/models/test_phi3v.py      | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index edf037d0a33a..7041570cb6ed 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -139,9 +139,10 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             best_max_tokens_exc_list.append((max_tokens, None))
 
     best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+    exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+    if best_max_tokens < 1:
+        raise next(exc for exc in exc_list if exc is not None)
     if best_max_tokens < max_tokens:
-        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-
         pytest.xfail(
             f"Test only fully passes when max_tokens={best_max_tokens} "
             f"(instead of {max_tokens}). Errors encountered per item: "
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 8d5256841be0..8cf1f79a7a7b 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -149,9 +149,10 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             best_max_tokens_exc_list.append((max_tokens, None))
 
     best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+    exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+    if best_max_tokens < 1:
+        raise next(exc for exc in exc_list if exc is not None)
     if best_max_tokens < max_tokens:
-        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-
         pytest.xfail(
             f"Test only fully passes when max_tokens={best_max_tokens} "
             f"(instead of {max_tokens}). Errors encountered per item: "
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 88be80cee5d4..1b22a3b02470 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -165,9 +165,10 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             best_max_tokens_exc_list.append((max_tokens, None))
 
     best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+    exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+    if best_max_tokens < 1:
+        raise next(exc for exc in exc_list if exc is not None)
     if best_max_tokens < max_tokens:
-        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-
         pytest.xfail(
             f"Test only fully passes when max_tokens={best_max_tokens} "
             f"(instead of {max_tokens}). Errors encountered per item: "

From 62952e1941695ddc3055e6a21383ddbb9f41a400 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 04:42:06 +0000
Subject: [PATCH 114/181] Fix phi3v not using input processor

---
 vllm/model_executor/models/phi3v.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 757d979771ef..5c0fde9602e6 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -405,6 +405,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
 
 @MULTIMODAL_REGISTRY.register_image_pixel_input_mapper()
 @INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
+@INPUT_REGISTRY.register_input_processor(input_processor_for_phi3v)
 class Phi3VForCausalLM(nn.Module, SupportsVision):
 
     def __init__(self,

From 0ce3ecbfea830ada57e1fe63a72edbce0e37e4d3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 07:02:13 +0000
Subject: [PATCH 115/181] Move size factors outside

---
 tests/models/test_llava.py      | 12 +++++++++---
 tests/models/test_llava_next.py | 12 +++++++++---
 tests/models/test_phi3v.py      | 12 +++++++++---
 3 files changed, 27 insertions(+), 9 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 7041570cb6ed..b99eef8ebd00 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -67,11 +67,18 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("size_factors", [
+    # Single-scale
+    [1.0],
+    # Single-scale, batched
+    [1.0, 1.0, 1.0],
+    # Multi-scale
+    [0.25, 0.5, 1.0],
+])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int, is_multiscale: bool) -> None:
+                size_factors, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -85,7 +92,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    size_factors = (0.25, 0.5, 1.0) if is_multiscale else (1, )
     image_inputs = [
         (rescale_image_size(hf_image, factor),
          ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 8cf1f79a7a7b..cc3a1adaf097 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -76,11 +76,18 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("size_factors", [
+    # Single-scale
+    [1.0],
+    # Single-scale, batched
+    [1.0, 1.0, 1.0],
+    # Multi-scale
+    [0.25, 0.5, 1.0],
+])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int, is_multiscale: bool) -> None:
+                size_factors, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -94,7 +101,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    size_factors = (0.25, 0.5, 1.0) if is_multiscale else (1, )
     image_inputs = [
         (rescale_image_size(hf_image, factor),
          ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 1b22a3b02470..68a45bbe309b 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -75,11 +75,18 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("size_factors", [
+    # Single-scale
+    [1.0],
+    # Single-scale, batched
+    [1.0, 1.0, 1.0],
+    # Multi-scale
+    [0.25, 0.5, 1.0],
+])
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("is_multiscale", [True, False])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int, is_multiscale: bool) -> None:
+                size_factors, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -95,7 +102,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    size_factors = (0.25, 0.5, 1.0) if is_multiscale else (1, )
     image_inputs = [
         (rescale_image_size(hf_image, factor),
          ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),

From b43e8c35a7a01a11be90554b5fb0330ee095f6a1 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 12:48:46 +0000
Subject: [PATCH 116/181] Apply formatter

---
 tests/models/test_llava.py      | 18 ++++++++++--------
 tests/models/test_llava_next.py | 18 ++++++++++--------
 tests/models/test_phi3v.py      | 18 ++++++++++--------
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index b99eef8ebd00..c447eea4796d 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -67,14 +67,16 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("size_factors", [
-    # Single-scale
-    [1.0],
-    # Single-scale, batched
-    [1.0, 1.0, 1.0],
-    # Multi-scale
-    [0.25, 0.5, 1.0],
-])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index cc3a1adaf097..46dc434a7233 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -76,14 +76,16 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("size_factors", [
-    # Single-scale
-    [1.0],
-    # Single-scale, batched
-    [1.0, 1.0, 1.0],
-    # Multi-scale
-    [0.25, 0.5, 1.0],
-])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 68a45bbe309b..0b1e40ccdc6d 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -75,14 +75,16 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("size_factors", [
-    # Single-scale
-    [1.0],
-    # Single-scale, batched
-    [1.0, 1.0, 1.0],
-    # Multi-scale
-    [0.25, 0.5, 1.0],
-])
+@pytest.mark.parametrize(
+    "size_factors",
+    [
+        # Single-scale
+        [1.0],
+        # Single-scale, batched
+        [1.0, 1.0, 1.0],
+        # Multi-scale
+        [0.25, 0.5, 1.0],
+    ])
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,

From 902379438800e9b506c140dca6a37e29f9988a8e Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sat, 29 Jun 2024 13:20:23 +0000
Subject: [PATCH 117/181] Fix some outputs not being checked

---
 tests/models/test_llava.py      | 2 +-
 tests/models/test_llava_next.py | 2 +-
 tests/models/test_phi3v.py      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index c447eea4796d..725858c4137a 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -125,7 +125,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     # There may be numeric differences for multiscale images due to
     # our implementation of CLIPVisionModel
     best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
-    for i in range(len(HF_IMAGE_PROMPTS)):
+    for i in range(len(image_inputs)):
         try:
             hf_output_ids, hf_output_str = hf_outputs[i]
             vllm_output_ids, vllm_output_str = vllm_to_hf_output(
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 46dc434a7233..f0587371a569 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -135,7 +135,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     # There may be numeric differences for multiscale images due to
     # our implementation of CLIPVisionModel
     best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
-    for i in range(len(HF_IMAGE_PROMPTS)):
+    for i in range(len(image_inputs)):
         try:
             hf_output_ids, hf_output_str = hf_outputs[i]
             vllm_output_ids, vllm_output_str = vllm_to_hf_output(
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 0b1e40ccdc6d..ff00dfff8d73 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -145,7 +145,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     # Since we use _attn_implementation="eager", there is numeric
     # difference for longer context (max_tokens=128) and test can't pass
     best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
-    for i in range(len(HF_IMAGE_PROMPTS)):
+    for i in range(len(image_inputs)):
         image_feature_size = get_phi3v_image_feature_size(
             hf_config,
             input_height=hf_image_inputs[i].height,

From f6c80616861704f4e2a67ef09bc0c1e634ea0cc2 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 01:29:39 +0000
Subject: [PATCH 118/181] Also test no image

---
 tests/models/test_llava.py      | 2 ++
 tests/models/test_llava_next.py | 5 ++++-
 tests/models/test_phi3v.py      | 5 ++++-
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index c2c099ebfb53..410eedbe3ed0 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -160,6 +160,8 @@ def run_test(
 @pytest.mark.parametrize(
     "size_factors",
     [
+        # No image
+        [],
         # Single-scale
         [1.0],
         # Single-scale, batched
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index f0587371a569..b8ddea66901c 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -79,13 +79,16 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize(
     "size_factors",
     [
+        # No image
+        [],
         # Single-scale
         [1.0],
         # Single-scale, batched
         [1.0, 1.0, 1.0],
         # Multi-scale
         [0.25, 0.5, 1.0],
-    ])
+    ],
+)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 80b7afa0ecb3..883bf1bdbcd7 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -184,13 +184,16 @@ def run_test(
 @pytest.mark.parametrize(
     "size_factors",
     [
+        # No image
+        [],
         # Single-scale
         [1.0],
         # Single-scale, batched
         [1.0, 1.0, 1.0],
         # Multi-scale
         [0.25, 0.5, 1.0],
-    ])
+    ],
+)
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,

From 235c8a96d6200e619d791451bfd3bd2688d9eaa0 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 04:19:53 +0000
Subject: [PATCH 119/181] Batch by size factors

---
 tests/conftest.py               |   9 +-
 tests/models/test_llava.py      | 115 +++++++++++++++-----------
 tests/models/test_llava_next.py | 115 +++++++++++++++-----------
 tests/models/test_phi3v.py      | 141 ++++++++++++++++++--------------
 4 files changed, 214 insertions(+), 166 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 7706b9dfd459..af1766d5a98d 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,8 +6,9 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import Any, Dict, List, Literal, Optional
+from typing import Sequence as GenericSequence
+from typing import Tuple, TypedDict, TypeVar
 
 import pytest
 import torch
@@ -453,7 +454,7 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[GenericSequence[MultiModalData]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -502,7 +503,7 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[GenericSequence[MultiModalData]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 9b80e21df08f..ecac6e26a059 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -92,16 +92,20 @@ def run_test(
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    image_inputs = [
-        (rescale_image_size(hf_image, factor),
-         ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-         prompt) for hf_image, vllm_image, prompt in zip(
-             hf_images, vllm_images, HF_IMAGE_PROMPTS)
-        for factor in size_factors
-    ]
-    prompt_inputs = [prompt for _, _, prompt in image_inputs]
-    hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
-    vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
+    image_inputs_per_size_factors = [[(
+        prompt,
+        rescale_image_size(hf_image, factor),
+        ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
+    ) for hf_image, vllm_image, prompt in zip(
+        hf_images, vllm_images, HF_IMAGE_PROMPTS)] for factor in size_factors]
+    hf_inputs_per_size_factors = [(
+        [prompt for prompt, hf_image, vllm_image in image_inputs],
+        [hf_image for prompt, hf_image, vllm_image in image_inputs],
+    ) for image_inputs in image_inputs_per_size_factors]
+    vllm_inputs_per_size_factors = [(
+        [prompt for prompt, hf_image, vllm_image in image_inputs],
+        [vllm_image for prompt, hf_image, vllm_image in image_inputs],
+    ) for image_inputs in image_inputs_per_size_factors]
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
@@ -110,51 +114,62 @@ def run_test(
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
-                                                  max_tokens,
-                                                  images=vllm_image_inputs)
+        vllm_outputs_per_size_factors = [
+            vllm_model.generate_greedy(prompts, max_tokens, images=vllm_images)
+            for prompts, vllm_images in vllm_inputs_per_size_factors
+        ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(prompt_inputs,
-                                              max_tokens,
-                                              images=hf_image_inputs)
-        hf_dummy_outputs = hf_model.generate_greedy(prompt_inputs,
-                                                    max_tokens=1,
-                                                    images=hf_image_inputs)
+        hf_outputs_per_size_factors = [
+            hf_model.generate_greedy(prompts, max_tokens, images=hf_images)
+            for prompts, hf_images in hf_inputs_per_size_factors
+        ]
+        hf_dummy_outputs_per_size_factors = [
+            hf_model.generate_greedy(prompts, max_tokens=1, images=hf_images)
+            for prompts, hf_images in hf_inputs_per_size_factors
+        ]
 
     # There may be numeric differences for multiscale images due to
     # our implementation of CLIPVisionModel
-    best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
-    for i in range(len(image_inputs)):
-        try:
-            hf_output_ids, hf_output_str = hf_outputs[i]
-            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                vllm_outputs[i], vlm_config, model_id)
-            assert hf_output_str == vllm_output_str, (
-                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-            assert hf_output_ids == vllm_output_ids, (
-                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-        except AssertionError as e:
-            num_match_tokens = sum(1 for _ in itertools.takewhile(
-                lambda pair: pair[0] == pair[1],
-                zip(hf_output_ids, vllm_output_ids),
-            ))
-            num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
-
-            best_max_tokens = num_match_tokens - num_prefix_tokens
-            best_max_tokens_exc_list.append((best_max_tokens, e))
-        else:
-            best_max_tokens_exc_list.append((max_tokens, None))
-
-    best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
-    exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-    if best_max_tokens < 1:
-        raise next(exc for exc in exc_list if exc is not None)
-    if best_max_tokens < max_tokens:
-        pytest.xfail(
-            f"Test only fully passes when max_tokens={best_max_tokens} "
-            f"(instead of {max_tokens}). Errors encountered per item: "
-            f"{exc_list}")
+    for image_inputs, vllm_outputs, hf_outputs, hf_dummy_outputs in zip(
+            image_inputs_per_size_factors,
+            vllm_outputs_per_size_factors,
+            hf_outputs_per_size_factors,
+            hf_dummy_outputs_per_size_factors,
+    ):
+        best_max_tokens_exc_list: List[Tuple[int,
+                                             Optional[AssertionError]]] = []
+        for i in range(len(image_inputs)):
+            try:
+                hf_output_ids, hf_output_str = hf_outputs[i]
+                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                    vllm_outputs[i], vlm_config, model_id)
+                assert hf_output_str == vllm_output_str, (
+                    f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
+                )
+                assert hf_output_ids == vllm_output_ids, (
+                    f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+            except AssertionError as e:
+                num_match_tokens = sum(1 for _ in itertools.takewhile(
+                    lambda pair: pair[0] == pair[1],
+                    zip(hf_output_ids, vllm_output_ids),
+                ))
+                num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
+
+                best_max_tokens = num_match_tokens - num_prefix_tokens
+                best_max_tokens_exc_list.append((best_max_tokens, e))
+            else:
+                best_max_tokens_exc_list.append((max_tokens, None))
+
+        best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+        if best_max_tokens < 1:
+            raise next(exc for exc in exc_list if exc is not None)
+        if best_max_tokens < max_tokens:
+            pytest.xfail(
+                f"Test only fully passes when max_tokens={best_max_tokens} "
+                f"(instead of {max_tokens}). Errors encountered per item: "
+                f"{exc_list}")
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 55f5be04f1bb..ed71560ff035 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -107,16 +107,20 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    image_inputs = [
-        (rescale_image_size(hf_image, factor),
-         ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-         prompt) for hf_image, vllm_image, prompt in zip(
-             hf_images, vllm_images, HF_IMAGE_PROMPTS)
-        for factor in size_factors
-    ]
-    prompt_inputs = [prompt for _, _, prompt in image_inputs]
-    hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
-    vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
+    image_inputs_per_size_factors = [[(
+        prompt,
+        rescale_image_size(hf_image, factor),
+        ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
+    ) for hf_image, vllm_image, prompt in zip(
+        hf_images, vllm_images, HF_IMAGE_PROMPTS)] for factor in size_factors]
+    hf_inputs_per_size_factors = [(
+        [prompt for prompt, hf_image, vllm_image in image_inputs],
+        [hf_image for prompt, hf_image, vllm_image in image_inputs],
+    ) for image_inputs in image_inputs_per_size_factors]
+    vllm_inputs_per_size_factors = [(
+        [prompt for prompt, hf_image, vllm_image in image_inputs],
+        [vllm_image for prompt, hf_image, vllm_image in image_inputs],
+    ) for image_inputs in image_inputs_per_size_factors]
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
@@ -124,48 +128,59 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                      max_model_len=4096,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
-                                                  max_tokens,
-                                                  images=vllm_image_inputs)
+        vllm_outputs_per_size_factors = [
+            vllm_model.generate_greedy(prompts, max_tokens, images=vllm_images)
+            for prompts, vllm_images in vllm_inputs_per_size_factors
+        ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(prompt_inputs,
-                                              max_tokens,
-                                              images=hf_image_inputs)
-        hf_dummy_outputs = hf_model.generate_greedy(prompt_inputs,
-                                                    max_tokens=1,
-                                                    images=hf_image_inputs)
+        hf_outputs_per_size_factors = [
+            hf_model.generate_greedy(prompts, max_tokens, images=hf_images)
+            for prompts, hf_images in hf_inputs_per_size_factors
+        ]
+        hf_dummy_outputs_per_size_factors = [
+            hf_model.generate_greedy(prompts, max_tokens=1, images=hf_images)
+            for prompts, hf_images in hf_inputs_per_size_factors
+        ]
 
     # There may be numeric differences for multiscale images due to
     # our implementation of CLIPVisionModel
-    best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
-    for i in range(len(image_inputs)):
-        try:
-            hf_output_ids, hf_output_str = hf_outputs[i]
-            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                vllm_outputs[i], vlm_config, model_id)
-            assert hf_output_str == vllm_output_str, (
-                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-            assert hf_output_ids == vllm_output_ids, (
-                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-        except AssertionError as e:
-            num_match_tokens = sum(1 for _ in itertools.takewhile(
-                lambda pair: pair[0] == pair[1],
-                zip(hf_output_ids, vllm_output_ids),
-            ))
-            num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
-
-            best_max_tokens = num_match_tokens - num_prefix_tokens
-            best_max_tokens_exc_list.append((best_max_tokens, e))
-        else:
-            best_max_tokens_exc_list.append((max_tokens, None))
-
-    best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
-    exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-    if best_max_tokens < 1:
-        raise next(exc for exc in exc_list if exc is not None)
-    if best_max_tokens < max_tokens:
-        pytest.xfail(
-            f"Test only fully passes when max_tokens={best_max_tokens} "
-            f"(instead of {max_tokens}). Errors encountered per item: "
-            f"{exc_list}")
+    for image_inputs, vllm_outputs, hf_outputs, hf_dummy_outputs in zip(
+            image_inputs_per_size_factors,
+            vllm_outputs_per_size_factors,
+            hf_outputs_per_size_factors,
+            hf_dummy_outputs_per_size_factors,
+    ):
+        best_max_tokens_exc_list: List[Tuple[int,
+                                             Optional[AssertionError]]] = []
+        for i in range(len(image_inputs)):
+            try:
+                hf_output_ids, hf_output_str = hf_outputs[i]
+                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                    vllm_outputs[i], vlm_config, model_id)
+                assert hf_output_str == vllm_output_str, (
+                    f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
+                )
+                assert hf_output_ids == vllm_output_ids, (
+                    f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+            except AssertionError as e:
+                num_match_tokens = sum(1 for _ in itertools.takewhile(
+                    lambda pair: pair[0] == pair[1],
+                    zip(hf_output_ids, vllm_output_ids),
+                ))
+                num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
+
+                best_max_tokens = num_match_tokens - num_prefix_tokens
+                best_max_tokens_exc_list.append((best_max_tokens, e))
+            else:
+                best_max_tokens_exc_list.append((max_tokens, None))
+
+        best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+        if best_max_tokens < 1:
+            raise next(exc for exc in exc_list if exc is not None)
+        if best_max_tokens < max_tokens:
+            pytest.xfail(
+                f"Test only fully passes when max_tokens={best_max_tokens} "
+                f"(instead of {max_tokens}). Errors encountered per item: "
+                f"{exc_list}")
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index b56dcc406f17..a7431a3aefd2 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -102,16 +102,20 @@ def run_test(
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    image_inputs = [
-        (rescale_image_size(hf_image, factor),
-         ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-         prompt) for hf_image, vllm_image, prompt in zip(
-             hf_images, vllm_images, HF_IMAGE_PROMPTS)
-        for factor in size_factors
-    ]
-    prompt_inputs = [prompt for _, _, prompt in image_inputs]
-    hf_image_inputs = [hf_image for hf_image, _, _ in image_inputs]
-    vllm_image_inputs = [vllm_image for _, vllm_image, _ in image_inputs]
+    image_inputs_per_size_factors = [[(
+        prompt,
+        rescale_image_size(hf_image, factor),
+        ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
+    ) for hf_image, vllm_image, prompt in zip(
+        hf_images, vllm_images, HF_IMAGE_PROMPTS)] for factor in size_factors]
+    hf_inputs_per_size_factors = [(
+        [prompt for prompt, hf_image, vllm_image in image_inputs],
+        [hf_image for prompt, hf_image, vllm_image in image_inputs],
+    ) for image_inputs in image_inputs_per_size_factors]
+    vllm_inputs_per_size_factors = [(
+        [prompt for prompt, hf_image, vllm_image in image_inputs],
+        [vllm_image for prompt, hf_image, vllm_image in image_inputs],
+    ) for image_inputs in image_inputs_per_size_factors]
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
@@ -119,66 +123,79 @@ def run_test(
                      dtype=dtype,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs = vllm_model.generate_greedy(prompt_inputs,
-                                                  max_tokens,
-                                                  images=vllm_image_inputs)
+        vllm_outputs_per_size_factors = [
+            vllm_model.generate_greedy(prompts, max_tokens, images=vllm_images)
+            for prompts, vllm_images in vllm_inputs_per_size_factors
+        ]
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
     hf_model_kwargs = {"_attn_implementation": "eager"}
     with hf_runner(model_id, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(
-            prompt_inputs,
-            max_tokens,
-            images=hf_image_inputs,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id,
-        )
-        hf_dummy_outputs = hf_model.generate_greedy(
-            prompt_inputs,
-            max_tokens=1,
-            images=hf_image_inputs,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id,
-        )
+        eos_token_id = hf_model.processor.tokenizer.eos_token_id
+        hf_outputs_per_size_factors = [
+            hf_model.generate_greedy(prompts,
+                                     max_tokens,
+                                     images=hf_images,
+                                     eos_token_id=eos_token_id)
+            for prompts, hf_images in hf_inputs_per_size_factors
+        ]
+        hf_dummy_outputs_per_size_factors = [
+            hf_model.generate_greedy(prompts,
+                                     max_tokens=1,
+                                     images=hf_images,
+                                     eos_token_id=eos_token_id)
+            for prompts, hf_images in hf_inputs_per_size_factors
+        ]
 
     # Since we use _attn_implementation="eager", there is numeric
     # difference for longer context (max_tokens=128) and test can't pass
-    best_max_tokens_exc_list: List[Tuple[int, Optional[AssertionError]]] = []
-    for i in range(len(image_inputs)):
-        image_feature_size = get_phi3v_image_feature_size(
-            hf_config,
-            input_height=hf_image_inputs[i].height,
-            input_width=hf_image_inputs[i].width,
-        )
-
-        try:
-            hf_output_ids, hf_output_str = hf_outputs[i]
-            vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                vllm_outputs[i], vlm_config, model_id, image_feature_size)
-            assert hf_output_str == vllm_output_str, (
-                f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
-            assert hf_output_ids == vllm_output_ids, (
-                f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-        except AssertionError as e:
-            num_match_tokens = sum(1 for _ in itertools.takewhile(
-                lambda pair: pair[0] == pair[1],
-                zip(hf_output_ids, vllm_output_ids),
-            ))
-            num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
-
-            best_max_tokens = num_match_tokens - num_prefix_tokens
-            best_max_tokens_exc_list.append((best_max_tokens, e))
-        else:
-            best_max_tokens_exc_list.append((max_tokens, None))
-
-    best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
-    exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-    if best_max_tokens < 1:
-        raise next(exc for exc in exc_list if exc is not None)
-    if best_max_tokens < max_tokens:
-        pytest.xfail(
-            f"Test only fully passes when max_tokens={best_max_tokens} "
-            f"(instead of {max_tokens}). Errors encountered per item: "
-            f"{exc_list}")
+    for image_inputs, vllm_outputs, hf_outputs, hf_dummy_outputs in zip(
+            image_inputs_per_size_factors,
+            vllm_outputs_per_size_factors,
+            hf_outputs_per_size_factors,
+            hf_dummy_outputs_per_size_factors,
+    ):
+        best_max_tokens_exc_list: List[Tuple[int,
+                                             Optional[AssertionError]]] = []
+        for i in range(len(image_inputs)):
+            _, hf_image, _ = image_inputs[i]
+            image_feature_size = get_phi3v_image_feature_size(
+                hf_config,
+                input_height=hf_image.height,
+                input_width=hf_image.width,
+            )
+
+            try:
+                hf_output_ids, hf_output_str = hf_outputs[i]
+                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+                    vllm_outputs[i], vlm_config, model_id, image_feature_size)
+                assert hf_output_str == vllm_output_str, (
+                    f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
+                )
+                assert hf_output_ids == vllm_output_ids, (
+                    f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+            except AssertionError as e:
+                num_match_tokens = sum(1 for _ in itertools.takewhile(
+                    lambda pair: pair[0] == pair[1],
+                    zip(hf_output_ids, vllm_output_ids),
+                ))
+                num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
+
+                best_max_tokens = num_match_tokens - num_prefix_tokens
+                best_max_tokens_exc_list.append((best_max_tokens, e))
+            else:
+                best_max_tokens_exc_list.append((max_tokens, None))
+
+        best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
+        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
+        if best_max_tokens < 1:
+            raise next(exc for exc in exc_list if exc is not None)
+        if best_max_tokens < max_tokens:
+            pytest.xfail(
+                f"Test only fully passes when max_tokens={best_max_tokens} "
+                f"(instead of {max_tokens}). Errors encountered per item: "
+                f"{exc_list}")
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)

From b98d92487a7e8e1d7c025bbd751a1637f3e13be3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 06:06:36 +0000
Subject: [PATCH 120/181] Factor out xfail code

---
 tests/models/test_llava.py      | 51 +++++++-------------
 tests/models/test_llava_next.py | 53 +++++++--------------
 tests/models/test_phi3v.py      | 64 ++++++++++---------------
 tests/models/utils.py           | 84 ++++++++++++++++++++++++++++++---
 4 files changed, 136 insertions(+), 116 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index ecac6e26a059..c51b09bce97c 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,4 +1,3 @@
-import itertools
 from typing import List, Optional, Tuple, Type
 
 import pytest
@@ -9,7 +8,7 @@
 from vllm.multimodal.utils import rescale_image_size
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal
+from .utils import check_outputs_equal_xfail
 
 pytestmark = pytest.mark.vlm
 
@@ -137,39 +136,21 @@ def run_test(
             hf_outputs_per_size_factors,
             hf_dummy_outputs_per_size_factors,
     ):
-        best_max_tokens_exc_list: List[Tuple[int,
-                                             Optional[AssertionError]]] = []
-        for i in range(len(image_inputs)):
-            try:
-                hf_output_ids, hf_output_str = hf_outputs[i]
-                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                    vllm_outputs[i], vlm_config, model_id)
-                assert hf_output_str == vllm_output_str, (
-                    f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
-                )
-                assert hf_output_ids == vllm_output_ids, (
-                    f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-            except AssertionError as e:
-                num_match_tokens = sum(1 for _ in itertools.takewhile(
-                    lambda pair: pair[0] == pair[1],
-                    zip(hf_output_ids, vllm_output_ids),
-                ))
-                num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
-
-                best_max_tokens = num_match_tokens - num_prefix_tokens
-                best_max_tokens_exc_list.append((best_max_tokens, e))
-            else:
-                best_max_tokens_exc_list.append((max_tokens, None))
-
-        best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
-        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-        if best_max_tokens < 1:
-            raise next(exc for exc in exc_list if exc is not None)
-        if best_max_tokens < max_tokens:
-            pytest.xfail(
-                f"Test only fully passes when max_tokens={best_max_tokens} "
-                f"(instead of {max_tokens}). Errors encountered per item: "
-                f"{exc_list}")
+        check_outputs_equal_xfail(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                for vllm_output in vllm_outputs
+            ],
+            outputs_num_prefix_tokens=[
+                len(hf_dummy_output[0]) - 1
+                for hf_dummy_output in hf_dummy_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+            min_tokens_to_xfail=1,
+            min_tokens_to_pass=max_tokens,
+        )
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index ed71560ff035..bd42d09dc046 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,6 +1,5 @@
-import itertools
 import re
-from typing import List, Optional, Tuple
+from typing import List, Tuple
 
 import pytest
 from transformers import AutoTokenizer
@@ -10,7 +9,7 @@
 from vllm.multimodal.utils import rescale_image_size
 
 from ..conftest import IMAGE_ASSETS
-from .utils import check_outputs_equal
+from .utils import check_outputs_equal_xfail
 
 pytestmark = pytest.mark.vlm
 
@@ -151,36 +150,18 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             hf_outputs_per_size_factors,
             hf_dummy_outputs_per_size_factors,
     ):
-        best_max_tokens_exc_list: List[Tuple[int,
-                                             Optional[AssertionError]]] = []
-        for i in range(len(image_inputs)):
-            try:
-                hf_output_ids, hf_output_str = hf_outputs[i]
-                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                    vllm_outputs[i], vlm_config, model_id)
-                assert hf_output_str == vllm_output_str, (
-                    f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
-                )
-                assert hf_output_ids == vllm_output_ids, (
-                    f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-            except AssertionError as e:
-                num_match_tokens = sum(1 for _ in itertools.takewhile(
-                    lambda pair: pair[0] == pair[1],
-                    zip(hf_output_ids, vllm_output_ids),
-                ))
-                num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
-
-                best_max_tokens = num_match_tokens - num_prefix_tokens
-                best_max_tokens_exc_list.append((best_max_tokens, e))
-            else:
-                best_max_tokens_exc_list.append((max_tokens, None))
-
-        best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
-        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-        if best_max_tokens < 1:
-            raise next(exc for exc in exc_list if exc is not None)
-        if best_max_tokens < max_tokens:
-            pytest.xfail(
-                f"Test only fully passes when max_tokens={best_max_tokens} "
-                f"(instead of {max_tokens}). Errors encountered per item: "
-                f"{exc_list}")
+        check_outputs_equal_xfail(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                for vllm_output in vllm_outputs
+            ],
+            outputs_num_prefix_tokens=[
+                len(hf_dummy_output[0]) - 1
+                for hf_dummy_output in hf_dummy_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+            min_tokens_to_xfail=1,
+            min_tokens_to_pass=max_tokens,
+        )
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index a7431a3aefd2..c471154bdf39 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,4 +1,3 @@
-import itertools
 import re
 from typing import List, Optional, Tuple, Type
 
@@ -12,7 +11,7 @@
 from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal
+from .utils import check_outputs_equal_xfail
 
 pytestmark = pytest.mark.vlm
 
@@ -156,46 +155,33 @@ def run_test(
             hf_outputs_per_size_factors,
             hf_dummy_outputs_per_size_factors,
     ):
-        best_max_tokens_exc_list: List[Tuple[int,
-                                             Optional[AssertionError]]] = []
-        for i in range(len(image_inputs)):
-            _, hf_image, _ = image_inputs[i]
-            image_feature_size = get_phi3v_image_feature_size(
+        image_feature_sizes = [
+            get_phi3v_image_feature_size(
                 hf_config,
                 input_height=hf_image.height,
                 input_width=hf_image.width,
-            )
-
-            try:
-                hf_output_ids, hf_output_str = hf_outputs[i]
-                vllm_output_ids, vllm_output_str = vllm_to_hf_output(
-                    vllm_outputs[i], vlm_config, model_id, image_feature_size)
-                assert hf_output_str == vllm_output_str, (
-                    f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}"
-                )
-                assert hf_output_ids == vllm_output_ids, (
-                    f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-            except AssertionError as e:
-                num_match_tokens = sum(1 for _ in itertools.takewhile(
-                    lambda pair: pair[0] == pair[1],
-                    zip(hf_output_ids, vllm_output_ids),
-                ))
-                num_prefix_tokens = len(hf_dummy_outputs[i][0]) - 1
-
-                best_max_tokens = num_match_tokens - num_prefix_tokens
-                best_max_tokens_exc_list.append((best_max_tokens, e))
-            else:
-                best_max_tokens_exc_list.append((max_tokens, None))
-
-        best_max_tokens = min(pair[0] for pair in best_max_tokens_exc_list)
-        exc_list = [pair[1] for pair in best_max_tokens_exc_list]
-        if best_max_tokens < 1:
-            raise next(exc for exc in exc_list if exc is not None)
-        if best_max_tokens < max_tokens:
-            pytest.xfail(
-                f"Test only fully passes when max_tokens={best_max_tokens} "
-                f"(instead of {max_tokens}). Errors encountered per item: "
-                f"{exc_list}")
+            ) for _, hf_image, _ in image_inputs
+        ]
+
+        check_outputs_equal_xfail(
+            outputs_0_lst=hf_outputs,
+            outputs_1_lst=[
+                vllm_to_hf_output(vllm_output,
+                                  vlm_config,
+                                  model_id,
+                                  image_feature_size=image_feature_size)
+                for vllm_output, image_feature_size in zip(
+                    vllm_outputs, image_feature_sizes)
+            ],
+            outputs_num_prefix_tokens=[
+                len(hf_dummy_output[0]) - 1
+                for hf_dummy_output in hf_dummy_outputs
+            ],
+            name_0="hf",
+            name_1="vllm",
+            min_tokens_to_xfail=1,
+            min_tokens_to_pass=max_tokens,
+        )
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 0d5e304d8446..eaa550945220 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,11 +1,18 @@
+import itertools
 from typing import Dict, List, Tuple
 
+import pytest
+
 TokensText = Tuple[List[int], str]
 
 
-def check_outputs_equal(outputs_0_lst: List[TokensText],
-                        outputs_1_lst: List[TokensText], name_0: str,
-                        name_1: str):
+def check_outputs_equal(
+    *,
+    outputs_0_lst: List[TokensText],
+    outputs_1_lst: List[TokensText],
+    name_0: str,
+    name_1: str,
+):
     """
     Compare the two sequences generated by different models, 
     which should be equal.
@@ -26,12 +33,77 @@ def check_outputs_equal(outputs_0_lst: List[TokensText],
                                               f"\n{name_1}:\t{output_str_1!r}")
 
 
+def check_outputs_equal_xfail(
+    *,
+    outputs_0_lst: List[TokensText],
+    outputs_1_lst: List[TokensText],
+    outputs_num_prefix_tokens: List[int],
+    name_0: str,
+    name_1: str,
+    min_tokens_to_xfail: int,
+    min_tokens_to_pass: int,
+):
+    """
+    Compare the two sequences generated by different models, 
+    which should be equal, but we xfail instead of completely failing
+    the test if the outputs are equal up to a certain number of
+    `min_tokens_to_xfail` after omitting the first `num_prefix_tokens`.
+    """
+    assert len(outputs_0_lst) == len(outputs_1_lst)
+    assert len(outputs_0_lst) == len(outputs_num_prefix_tokens)
+
+    num_tokens_to_pass_exc_list: List[Tuple[int, AssertionError]] = []
+    for prompt_idx, (outputs_0, outputs_1, num_prefix_tokens) in enumerate(
+            zip(outputs_0_lst, outputs_1_lst, outputs_num_prefix_tokens)):
+
+        output_ids_0, output_str_0 = outputs_0
+        output_ids_1, output_str_1 = outputs_1
+
+        try:
+            assert output_str_0 == output_str_1, (
+                f"Test{prompt_idx}:"
+                f"\n{name_0}:\t{output_str_0!r}"
+                f"\n{name_1}:\t{output_str_1!r}")
+            assert output_ids_0 == output_ids_1, (
+                f"Test{prompt_idx}:"
+                f"\n{name_0}:\t{output_str_0!r}"
+                f"\n{name_1}:\t{output_str_1!r}")
+        except AssertionError as e:
+            num_tokens_to_pass = sum(1 for _ in itertools.takewhile(
+                lambda pair: pair[0] == pair[1],
+                zip(output_ids_0[num_prefix_tokens:],
+                    output_ids_1[num_prefix_tokens:]),
+            ))
+
+            assert num_tokens_to_pass < min_tokens_to_pass, (
+                "The original assertion should pass")
+
+            if num_tokens_to_pass < min_tokens_to_xfail:
+                raise
+
+            num_tokens_to_pass_exc_list.append((num_tokens_to_pass, e))
+
+    if num_tokens_to_pass_exc_list:
+        num_tokens_to_pass = min(pair[0]
+                                 for pair in num_tokens_to_pass_exc_list)
+        exc_list = [pair[1] for pair in num_tokens_to_pass_exc_list]
+
+        pytest.xfail(
+            f"Test only fully passes when max_tokens={num_tokens_to_pass} "
+            f"(instead of {min_tokens_to_pass}). "
+            f"Errors encountered per item: {exc_list}")
+
+
 TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]]
 
 
-def check_logprobs_close(outputs_0_lst: List[TokensTextLogprobs],
-                         outputs_1_lst: List[TokensTextLogprobs], name_0: str,
-                         name_1: str):
+def check_logprobs_close(
+    *,
+    outputs_0_lst: List[TokensTextLogprobs],
+    outputs_1_lst: List[TokensTextLogprobs],
+    name_0: str,
+    name_1: str,
+):
     """
     Compare the logprobs of two sequences generated by different models,
     which should be similar but not necessarily equal.

From 2c2558b08a294c265548134f05c95cf4e238597b Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 06:11:41 +0000
Subject: [PATCH 121/181] Fix unused args

---
 tests/models/test_phi3v.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index c471154bdf39..7420d3c0a8c9 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -120,6 +120,8 @@ def run_test(
     with vllm_runner(model_id,
                      max_model_len=4096,
                      dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
+                     distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs_per_size_factors = [

From ec28eca01cc2c7f85835741e0dc8cc892fb0f259 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 08:44:04 +0000
Subject: [PATCH 122/181] Check logprobs instead of xfailing

---
 tests/conftest.py                             |  59 ++++++--
 .../distributed/test_multimodal_broadcast.py  |   6 +-
 tests/models/test_llava.py                    |  52 ++++---
 tests/models/test_llava_next.py               |  50 +++----
 tests/models/test_phi3v.py                    |  84 ++++-------
 tests/models/utils.py                         | 136 +++++++-----------
 6 files changed, 182 insertions(+), 205 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index af1766d5a98d..ad672da029bd 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -243,7 +243,7 @@ def generate(
         self,
         prompts: List[str],
         images: Optional[List[Image.Image]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images:
             assert len(prompts) == len(images)
@@ -278,7 +278,7 @@ def generate_greedy(
         prompts: List[str],
         max_tokens: int,
         images: Optional[List[Image.Image]] = None,
-        **kwargs,
+        **kwargs: Any,
     ) -> List[Tuple[List[int], str]]:
         outputs = self.generate(prompts,
                                 do_sample=False,
@@ -314,19 +314,30 @@ def generate_greedy_logprobs(
         self,
         prompts: List[str],
         max_tokens: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
     ) -> List[List[torch.Tensor]]:
-        all_logprobs = []
-        for prompt in prompts:
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        all_logprobs: List[List[torch.Tensor]] = []
+        for i, prompt in enumerate(prompts):
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
+            inputs = self.processor(**processor_kwargs)
+
             output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
                 output_hidden_states=True,
                 return_dict_in_generate=True,
+                **kwargs,
             )
-            seq_logprobs = []
+            seq_logprobs: List[torch.Tensor] = []
             for hidden_states in output.hidden_states:
                 last_hidden_states = hidden_states[-1][0]
                 logits = torch.matmul(
@@ -346,20 +357,32 @@ def generate_greedy_logprobs_limit(
         prompts: List[str],
         max_tokens: int,
         num_logprobs: int,
+        images: Optional[List[Image.Image]] = None,
+        **kwargs: Any,
     ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
         all_logprobs: List[List[Dict[int, float]]] = []
         all_output_ids: List[List[int]] = []
         all_output_strs: List[str] = []
 
-        for prompt in prompts:
-            input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
+        for i, prompt in enumerate(prompts):
+            processor_kwargs: Dict[str, Any] = {
+                "text": prompt,
+                "return_tensors": "pt",
+            }
+            if images is not None and images[i] is not None:
+                processor_kwargs["images"] = images[i]
+
+            inputs = self.processor(**processor_kwargs)
+            input_ids = inputs.input_ids
+
             output = self.model.generate(
-                self.wrap_device(input_ids),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
                 output_hidden_states=True,
                 return_dict_in_generate=True,
+                **kwargs,
             )
 
             seq_logprobs: List[torch.Tensor] = []
@@ -485,10 +508,19 @@ def generate_w_logprobs(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
+        images: Optional[GenericSequence[MultiModalData]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         assert sampling_params.logprobs is not None
 
-        req_outputs = self.model.generate(prompts,
+        if images is not None:
+            assert len(prompts) == len(images)
+
+        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
+        if images is not None:
+            for i, image in enumerate(images):
+                inputs[i]["multi_modal_data"] = image
+
+        req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
         outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
         for req_output in req_outputs:
@@ -515,11 +547,14 @@ def generate_greedy_logprobs(
         prompts: List[str],
         max_tokens: int,
         num_logprobs: int,
+        images: Optional[GenericSequence[MultiModalData]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                 max_tokens=max_tokens,
                                                 logprobs=num_logprobs)
-        outputs = self.generate_w_logprobs(prompts, greedy_logprobs_params)
+        outputs = self.generate_w_logprobs(prompts,
+                                           greedy_logprobs_params,
+                                           images=images)
 
         return [(output_ids, output_str, output_logprobs)
                 for output_ids, output_str, output_logprobs in outputs]
diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py
index 0a4ae2513ca9..1d143a8526f4 100644
--- a/tests/distributed/test_multimodal_broadcast.py
+++ b/tests/distributed/test_multimodal_broadcast.py
@@ -30,9 +30,10 @@
 @pytest.mark.parametrize("tensor_parallel_size", [2])
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets,
-                tensor_parallel_size: int, dtype: str,
-                max_tokens: int) -> None:
+                tensor_parallel_size: int, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     if cuda_device_count_stateless() < tensor_parallel_size:
         pytest.skip(
             f"Need at least {tensor_parallel_size} GPUs to run the test.")
@@ -47,6 +48,7 @@ def test_models(hf_runner, vllm_runner, image_assets,
         size_factors=[1.0],
         dtype=dtype,
         max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=tensor_parallel_size,
         distributed_executor_backend=distributed_executor_backend,
     )
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index c51b09bce97c..68ac895d02c0 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -6,9 +6,10 @@
 from vllm.config import VisionLanguageConfig
 from vllm.multimodal.image import ImagePixelData
 from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal_xfail
+from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
 
@@ -43,14 +44,15 @@ def iter_llava_configs(model_name: str):
 ]
 
 
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    output_ids, output_str = vllm_output
+    output_ids, output_str, out_logprobs = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -60,10 +62,10 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
         token_id for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = output_str \
-        .replace(image_token_str * vlm_config.image_feature_size, " ")
+    hf_output_str = output_str.lstrip() \
+        .replace(image_token_str * vlm_config.image_feature_size, "")
 
-    return hf_output_ids, hf_output_str
+    return hf_output_ids, hf_output_str, out_logprobs
 
 
 def run_test(
@@ -75,6 +77,7 @@ def run_test(
     size_factors: List[float],
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
@@ -114,42 +117,32 @@ def run_test(
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs_per_size_factors = [
-            vllm_model.generate_greedy(prompts, max_tokens, images=vllm_images)
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=vllm_images)
             for prompts, vllm_images in vllm_inputs_per_size_factors
         ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs_per_size_factors = [
-            hf_model.generate_greedy(prompts, max_tokens, images=hf_images)
-            for prompts, hf_images in hf_inputs_per_size_factors
-        ]
-        hf_dummy_outputs_per_size_factors = [
-            hf_model.generate_greedy(prompts, max_tokens=1, images=hf_images)
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=hf_images)
             for prompts, hf_images in hf_inputs_per_size_factors
         ]
 
-    # There may be numeric differences for multiscale images due to
-    # our implementation of CLIPVisionModel
-    for image_inputs, vllm_outputs, hf_outputs, hf_dummy_outputs in zip(
-            image_inputs_per_size_factors,
-            vllm_outputs_per_size_factors,
-            hf_outputs_per_size_factors,
-            hf_dummy_outputs_per_size_factors,
-    ):
-        check_outputs_equal_xfail(
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_size_factors,
+                                        vllm_outputs_per_size_factors):
+        check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
                 vllm_to_hf_output(vllm_output, vlm_config, model_id)
                 for vllm_output in vllm_outputs
             ],
-            outputs_num_prefix_tokens=[
-                len(hf_dummy_output[0]) - 1
-                for hf_dummy_output in hf_dummy_outputs
-            ],
             name_0="hf",
             name_1="vllm",
-            min_tokens_to_xfail=1,
-            min_tokens_to_pass=max_tokens,
         )
 
 
@@ -169,8 +162,10 @@ def run_test(
 )
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int) -> None:
+                size_factors, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     run_test(
         hf_runner,
         vllm_runner,
@@ -179,5 +174,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
         size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=1,
     )
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index bd42d09dc046..b60dd9012d12 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,5 +1,5 @@
 import re
-from typing import List, Tuple
+from typing import List, Optional, Tuple
 
 import pytest
 from transformers import AutoTokenizer
@@ -7,9 +7,10 @@
 from vllm.config import VisionLanguageConfig
 from vllm.multimodal.image import ImagePixelData
 from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS
-from .utils import check_outputs_equal_xfail
+from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
 
@@ -53,14 +54,15 @@ def iter_llava_next_configs(model_name: str):
 ]
 
 
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    output_ids, output_str = vllm_output
+    output_ids, output_str, out_logprobs = vllm_output
     image_token_id = vlm_config.image_token_id
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -70,9 +72,9 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
         token_id for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
+    hf_output_str = re.sub(fr"({image_token_str})+", "", output_str).lstrip()
 
-    return hf_output_ids, hf_output_str
+    return hf_output_ids, hf_output_str, out_logprobs
 
 
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
@@ -91,8 +93,10 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 )
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int) -> None:
+                size_factors, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -128,40 +132,30 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs_per_size_factors = [
-            vllm_model.generate_greedy(prompts, max_tokens, images=vllm_images)
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=vllm_images)
             for prompts, vllm_images in vllm_inputs_per_size_factors
         ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
         hf_outputs_per_size_factors = [
-            hf_model.generate_greedy(prompts, max_tokens, images=hf_images)
-            for prompts, hf_images in hf_inputs_per_size_factors
-        ]
-        hf_dummy_outputs_per_size_factors = [
-            hf_model.generate_greedy(prompts, max_tokens=1, images=hf_images)
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=hf_images)
             for prompts, hf_images in hf_inputs_per_size_factors
         ]
 
-    # There may be numeric differences for multiscale images due to
-    # our implementation of CLIPVisionModel
-    for image_inputs, vllm_outputs, hf_outputs, hf_dummy_outputs in zip(
-            image_inputs_per_size_factors,
-            vllm_outputs_per_size_factors,
-            hf_outputs_per_size_factors,
-            hf_dummy_outputs_per_size_factors,
-    ):
-        check_outputs_equal_xfail(
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_size_factors,
+                                        vllm_outputs_per_size_factors):
+        check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
                 vllm_to_hf_output(vllm_output, vlm_config, model_id)
                 for vllm_output in vllm_outputs
             ],
-            outputs_num_prefix_tokens=[
-                len(hf_dummy_output[0]) - 1
-                for hf_dummy_output in hf_dummy_outputs
-            ],
             name_0="hf",
             name_1="vllm",
-            min_tokens_to_xfail=1,
-            min_tokens_to_pass=max_tokens,
         )
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 7420d3c0a8c9..5eaccf1f76ba 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -2,16 +2,16 @@
 from typing import List, Optional, Tuple, Type
 
 import pytest
-from transformers import AutoConfig, AutoTokenizer
+from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
-from vllm.model_executor.models.phi3v import get_phi3v_image_feature_size
 from vllm.multimodal.image import ImagePixelData
 from vllm.multimodal.utils import rescale_image_size
+from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
-from .utils import check_outputs_equal_xfail
+from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
 
@@ -47,26 +47,26 @@ def iter_phi3v_configs(model_name: str):
 ]
 
 
-def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
-                      vlm_config: VisionLanguageConfig, model_id: str,
-                      image_feature_size: int):
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
+                                         Optional[SampleLogprobs]],
+                      vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    output_ids, output_str = vllm_output
-    output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", " ", output_str)
+    output_ids, output_str, out_logprobs = vllm_output
+    output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
 
     hf_output_str = output_str_without_image.replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     hf_output_ids = tokenizer.encode(output_str_without_image)
-    hf_output_ids = hf_output_ids[:4] + [0] * image_feature_size \
-        + [1] + hf_output_ids[4:]
+    assert hf_output_ids[:2] == [1, 29871]
+    hf_output_ids = hf_output_ids[2:]
 
-    return hf_output_ids, hf_output_str
+    return hf_output_ids, hf_output_str, out_logprobs
 
 
 target_dtype = "half"
@@ -83,6 +83,7 @@ def run_test(
     size_factors: List[float],
     dtype: str,
     max_tokens: int,
+    num_logprobs: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
@@ -96,7 +97,6 @@ def run_test(
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vlm_config = model_and_config
-    hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
 
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
@@ -125,7 +125,10 @@ def run_test(
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs_per_size_factors = [
-            vllm_model.generate_greedy(prompts, max_tokens, images=vllm_images)
+            vllm_model.generate_greedy_logprobs(prompts,
+                                                max_tokens,
+                                                num_logprobs=num_logprobs,
+                                                images=vllm_images)
             for prompts, vllm_images in vllm_inputs_per_size_factors
         ]
 
@@ -135,54 +138,24 @@ def run_test(
                    model_kwargs=hf_model_kwargs) as hf_model:
         eos_token_id = hf_model.processor.tokenizer.eos_token_id
         hf_outputs_per_size_factors = [
-            hf_model.generate_greedy(prompts,
-                                     max_tokens,
-                                     images=hf_images,
-                                     eos_token_id=eos_token_id)
-            for prompts, hf_images in hf_inputs_per_size_factors
-        ]
-        hf_dummy_outputs_per_size_factors = [
-            hf_model.generate_greedy(prompts,
-                                     max_tokens=1,
-                                     images=hf_images,
-                                     eos_token_id=eos_token_id)
+            hf_model.generate_greedy_logprobs_limit(prompts,
+                                                    max_tokens,
+                                                    num_logprobs=num_logprobs,
+                                                    images=hf_images,
+                                                    eos_token_id=eos_token_id)
             for prompts, hf_images in hf_inputs_per_size_factors
         ]
 
-    # Since we use _attn_implementation="eager", there is numeric
-    # difference for longer context (max_tokens=128) and test can't pass
-    for image_inputs, vllm_outputs, hf_outputs, hf_dummy_outputs in zip(
-            image_inputs_per_size_factors,
-            vllm_outputs_per_size_factors,
-            hf_outputs_per_size_factors,
-            hf_dummy_outputs_per_size_factors,
-    ):
-        image_feature_sizes = [
-            get_phi3v_image_feature_size(
-                hf_config,
-                input_height=hf_image.height,
-                input_width=hf_image.width,
-            ) for _, hf_image, _ in image_inputs
-        ]
-
-        check_outputs_equal_xfail(
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_size_factors,
+                                        vllm_outputs_per_size_factors):
+        check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
-                vllm_to_hf_output(vllm_output,
-                                  vlm_config,
-                                  model_id,
-                                  image_feature_size=image_feature_size)
-                for vllm_output, image_feature_size in zip(
-                    vllm_outputs, image_feature_sizes)
-            ],
-            outputs_num_prefix_tokens=[
-                len(hf_dummy_output[0]) - 1
-                for hf_dummy_output in hf_dummy_outputs
+                vllm_to_hf_output(vllm_output, vlm_config, model_id)
+                for vllm_output in vllm_outputs
             ],
             name_0="hf",
             name_1="vllm",
-            min_tokens_to_xfail=1,
-            min_tokens_to_pass=max_tokens,
         )
 
 
@@ -202,8 +175,10 @@ def run_test(
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
+@pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                size_factors, dtype: str, max_tokens: int) -> None:
+                size_factors, dtype: str, max_tokens: int,
+                num_logprobs: int) -> None:
     run_test(
         hf_runner,
         vllm_runner,
@@ -212,5 +187,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
         size_factors=size_factors,
         dtype=dtype,
         max_tokens=max_tokens,
+        num_logprobs=num_logprobs,
         tensor_parallel_size=1,
     )
diff --git a/tests/models/utils.py b/tests/models/utils.py
index eaa550945220..51d57129d9d2 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,15 +1,15 @@
-import itertools
-from typing import Dict, List, Tuple
+import warnings
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
-import pytest
+from vllm.sequence import SampleLogprobs
 
 TokensText = Tuple[List[int], str]
 
 
 def check_outputs_equal(
     *,
-    outputs_0_lst: List[TokensText],
-    outputs_1_lst: List[TokensText],
+    outputs_0_lst: Sequence[TokensText],
+    outputs_1_lst: Sequence[TokensText],
     name_0: str,
     name_1: str,
 ):
@@ -25,84 +25,27 @@ def check_outputs_equal(
         output_ids_0, output_str_0 = outputs_0
         output_ids_1, output_str_1 = outputs_1
 
-        assert output_str_0 == output_str_1, (f"Test{prompt_idx}:"
-                                              f"\n{name_0}:\t{output_str_0!r}"
-                                              f"\n{name_1}:\t{output_str_1!r}")
-        assert output_ids_0 == output_ids_1, (f"Test{prompt_idx}:"
-                                              f"\n{name_0}:\t{output_str_0!r}"
-                                              f"\n{name_1}:\t{output_str_1!r}")
-
-
-def check_outputs_equal_xfail(
-    *,
-    outputs_0_lst: List[TokensText],
-    outputs_1_lst: List[TokensText],
-    outputs_num_prefix_tokens: List[int],
-    name_0: str,
-    name_1: str,
-    min_tokens_to_xfail: int,
-    min_tokens_to_pass: int,
-):
-    """
-    Compare the two sequences generated by different models, 
-    which should be equal, but we xfail instead of completely failing
-    the test if the outputs are equal up to a certain number of
-    `min_tokens_to_xfail` after omitting the first `num_prefix_tokens`.
-    """
-    assert len(outputs_0_lst) == len(outputs_1_lst)
-    assert len(outputs_0_lst) == len(outputs_num_prefix_tokens)
-
-    num_tokens_to_pass_exc_list: List[Tuple[int, AssertionError]] = []
-    for prompt_idx, (outputs_0, outputs_1, num_prefix_tokens) in enumerate(
-            zip(outputs_0_lst, outputs_1_lst, outputs_num_prefix_tokens)):
-
-        output_ids_0, output_str_0 = outputs_0
-        output_ids_1, output_str_1 = outputs_1
-
-        try:
-            assert output_str_0 == output_str_1, (
-                f"Test{prompt_idx}:"
-                f"\n{name_0}:\t{output_str_0!r}"
-                f"\n{name_1}:\t{output_str_1!r}")
-            assert output_ids_0 == output_ids_1, (
-                f"Test{prompt_idx}:"
-                f"\n{name_0}:\t{output_str_0!r}"
-                f"\n{name_1}:\t{output_str_1!r}")
-        except AssertionError as e:
-            num_tokens_to_pass = sum(1 for _ in itertools.takewhile(
-                lambda pair: pair[0] == pair[1],
-                zip(output_ids_0[num_prefix_tokens:],
-                    output_ids_1[num_prefix_tokens:]),
-            ))
-
-            assert num_tokens_to_pass < min_tokens_to_pass, (
-                "The original assertion should pass")
-
-            if num_tokens_to_pass < min_tokens_to_xfail:
-                raise
-
-            num_tokens_to_pass_exc_list.append((num_tokens_to_pass, e))
-
-    if num_tokens_to_pass_exc_list:
-        num_tokens_to_pass = min(pair[0]
-                                 for pair in num_tokens_to_pass_exc_list)
-        exc_list = [pair[1] for pair in num_tokens_to_pass_exc_list]
+        # The text and token outputs should exactly match
+        fail_msg = (f"Test{prompt_idx}:"
+                    f"\n{name_0}:\t{output_str_0!r}"
+                    f"\n{name_1}:\t{output_str_1!r}")
 
-        pytest.xfail(
-            f"Test only fully passes when max_tokens={num_tokens_to_pass} "
-            f"(instead of {min_tokens_to_pass}). "
-            f"Errors encountered per item: {exc_list}")
+        assert output_str_0 == output_str_1, fail_msg
+        assert output_ids_0 == output_ids_1, fail_msg
 
 
-TokensTextLogprobs = Tuple[List[int], str, List[Dict[int, float]]]
+TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
+                                                                    float]],
+                                                          SampleLogprobs]]]
 
 
 def check_logprobs_close(
     *,
-    outputs_0_lst: List[TokensTextLogprobs],
-    outputs_1_lst: List[TokensTextLogprobs],
+    outputs_0_lst: Sequence[TokensTextLogprobs],
+    outputs_1_lst: Sequence[TokensTextLogprobs],
     name_0: str,
     name_1: str,
+    warn_on_mismatch: bool = True,
 ):
     """
     Compare the logprobs of two sequences generated by different models,
@@ -117,21 +60,52 @@ def check_logprobs_close(
         output_ids_0, output_str_0, logprobs_0 = outputs_0
         output_ids_1, output_str_1, logprobs_1 = outputs_1
 
+        if logprobs_0 is None:
+            logprobs_0 = [None] * len(output_ids_0)
+        if logprobs_1 is None:
+            logprobs_1 = [None] * len(output_ids_1)
+
         # Loop through generated tokens.
         for idx, (output_id_0,
                   output_id_1) in enumerate(zip(output_ids_0, output_ids_1)):
 
             # If generated tokens don't match, then
             if output_id_0 != output_id_1:
+                logprobs_elem_0 = logprobs_0[idx]
+                logprobs_elem_1 = logprobs_1[idx]
+
                 # Each predicted token must be in top N logprobs of the other
-                assert output_id_0 in logprobs_1[idx], (
+                fail_msg = (
                     f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{output_str_0!r}"
-                    f"\n{name_1}:\t{output_str_1!r}")
-                assert output_id_1 in logprobs_0[idx], (
-                    f"Test{prompt_idx}:"
-                    f"\n{name_0}:\t{output_str_0!r}"
-                    f"\n{name_1}:\t{output_str_1!r}")
+                    f"\n{name_0}:\t{output_str_0!r}\t{logprobs_elem_0}"
+                    f"\n{name_1}:\t{output_str_1!r}\t{logprobs_elem_1}")
+
+                assert logprobs_elem_0 is not None, fail_msg
+                assert logprobs_elem_1 is not None, fail_msg
+                assert output_id_0 in logprobs_elem_1, fail_msg
+                assert output_id_1 in logprobs_elem_0, fail_msg
+
+                if warn_on_mismatch:
+                    with warnings.catch_warnings():
+                        # This ensures that repeated warnings are shown
+                        # in the output, not just the first occurrence
+                        warnings.simplefilter("always")
+
+                        warnings.warn(fail_msg, stacklevel=2)
 
                 # Break out since sequences will now diverge.
                 break
+        else:
+            if output_str_0 != output_str_1 and warn_on_mismatch:
+                # The token outputs exactly match,
+                # so the text outputs should exactly match as well
+                fail_msg = (f"Test{prompt_idx}:"
+                            f"\n{name_0}:\t{output_str_0!r}"
+                            f"\n{name_1}:\t{output_str_1!r}")
+
+                with warnings.catch_warnings():
+                    # This ensures that repeated warnings are shown
+                    # in the output, not just the first occurrence
+                    warnings.simplefilter("always")
+
+                    warnings.warn(fail_msg, stacklevel=2)

From 2eb349026a25d3fd3e27b2c1ee1d16c3b4889733 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 08:59:44 +0000
Subject: [PATCH 123/181] Fix different scales not being in the same batch

---
 tests/models/test_llava.py      | 27 +++++++++++++--------------
 tests/models/test_llava_next.py | 26 +++++++++++++-------------
 tests/models/test_phi3v.py      | 27 +++++++++++++--------------
 3 files changed, 39 insertions(+), 41 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 2a2207a1e838..23de2c476c5b 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -110,41 +110,40 @@ def run_test(
         # i.e. after creating vLLM instance.
         vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-        image_inputs_per_size_factors = [[(
+        image_inputs_per_image = [[(
             prompt,
             rescale_image_size(hf_image, factor),
             ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-        ) for hf_image, vllm_image, prompt in zip(hf_images, vllm_images,
-                                                  HF_IMAGE_PROMPTS)]
-                                         for factor in size_factors]
-        hf_inputs_per_size_factors = [(
+        ) for factor in size_factors] for hf_image, vllm_image, prompt in zip(
+            hf_images, vllm_images, HF_IMAGE_PROMPTS)]
+        hf_inputs_per_image = [(
             [prompt for prompt, hf_image, vllm_image in image_inputs],
             [hf_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_size_factors]
-        vllm_inputs_per_size_factors = [(
+        ) for image_inputs in image_inputs_per_image]
+        vllm_inputs_per_image = [(
             [prompt for prompt, hf_image, vllm_image in image_inputs],
             [vllm_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_size_factors]
+        ) for image_inputs in image_inputs_per_image]
 
-        vllm_outputs_per_size_factors = [
+        vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
                                                 images=vllm_images)
-            for prompts, vllm_images in vllm_inputs_per_size_factors
+            for prompts, vllm_images in vllm_inputs_per_image
         ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs_per_size_factors = [
+        hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=hf_images)
-            for prompts, hf_images in hf_inputs_per_size_factors
+            for prompts, hf_images in hf_inputs_per_image
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_size_factors,
-                                        vllm_outputs_per_size_factors):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index b60dd9012d12..411e45f65ec4 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -110,20 +110,20 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
 
-    image_inputs_per_size_factors = [[(
+    image_inputs_per_image = [[(
         prompt,
         rescale_image_size(hf_image, factor),
         ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-    ) for hf_image, vllm_image, prompt in zip(
-        hf_images, vllm_images, HF_IMAGE_PROMPTS)] for factor in size_factors]
-    hf_inputs_per_size_factors = [(
+    ) for factor in size_factors] for hf_image, vllm_image, prompt in zip(
+        hf_images, vllm_images, HF_IMAGE_PROMPTS)]
+    hf_inputs_per_image = [(
         [prompt for prompt, hf_image, vllm_image in image_inputs],
         [hf_image for prompt, hf_image, vllm_image in image_inputs],
-    ) for image_inputs in image_inputs_per_size_factors]
-    vllm_inputs_per_size_factors = [(
+    ) for image_inputs in image_inputs_per_image]
+    vllm_inputs_per_image = [(
         [prompt for prompt, hf_image, vllm_image in image_inputs],
         [vllm_image for prompt, hf_image, vllm_image in image_inputs],
-    ) for image_inputs in image_inputs_per_size_factors]
+    ) for image_inputs in image_inputs_per_image]
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
@@ -131,25 +131,25 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                      max_model_len=4096,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs_per_size_factors = [
+        vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
                                                 images=vllm_images)
-            for prompts, vllm_images in vllm_inputs_per_size_factors
+            for prompts, vllm_images in vllm_inputs_per_image
         ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs_per_size_factors = [
+        hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=hf_images)
-            for prompts, hf_images in hf_inputs_per_size_factors
+            for prompts, hf_images in hf_inputs_per_image
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_size_factors,
-                                        vllm_outputs_per_size_factors):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 90eb93ae4d62..1b6ad3b12213 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -123,21 +123,20 @@ def run_test(
             for p in HF_IMAGE_PROMPTS
         ]
 
-        image_inputs_per_size_factors = [[(
+        image_inputs_per_image = [[(
             prompt,
             rescale_image_size(hf_image, factor),
             ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-        ) for hf_image, vllm_image, prompt in zip(hf_images, vllm_images,
-                                                  HF_IMAGE_PROMPTS)]
-                                         for factor in size_factors]
-        hf_inputs_per_size_factors = [(
+        ) for factor in size_factors] for hf_image, vllm_image, prompt in zip(
+            hf_images, vllm_images, HF_IMAGE_PROMPTS)]
+        hf_inputs_per_image = [(
             [prompt for prompt, hf_image, vllm_image in image_inputs],
             [hf_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_size_factors]
-        vllm_inputs_per_size_factors = [(
+        ) for image_inputs in image_inputs_per_image]
+        vllm_inputs_per_image = [(
             [prompt for prompt, hf_image, vllm_image in image_inputs],
             [vllm_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_size_factors]
+        ) for image_inputs in image_inputs_per_image]
 
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
@@ -151,12 +150,12 @@ def run_test(
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        vllm_outputs_per_size_factors = [
+        vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
                                                 images=vllm_images)
-            for prompts, vllm_images in vllm_inputs_per_size_factors
+            for prompts, vllm_images in vllm_inputs_per_image
         ]
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
@@ -164,17 +163,17 @@ def run_test(
     with hf_runner(model_id, dtype=dtype,
                    model_kwargs=hf_model_kwargs) as hf_model:
         eos_token_id = hf_model.processor.tokenizer.eos_token_id
-        hf_outputs_per_size_factors = [
+        hf_outputs_per_image = [
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
                                                     images=hf_images,
                                                     eos_token_id=eos_token_id)
-            for prompts, hf_images in hf_inputs_per_size_factors
+            for prompts, hf_images in hf_inputs_per_image
         ]
 
-    for hf_outputs, vllm_outputs in zip(hf_outputs_per_size_factors,
-                                        vllm_outputs_per_size_factors):
+    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
+                                        vllm_outputs_per_image):
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[

From 6301a52088176e9ed8751724d161772da94e5537 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <cyrus.tl.leung@gmail.com>
Date: Sun, 30 Jun 2024 17:02:58 +0800
Subject: [PATCH 124/181] Apply suggestions from code review

Co-authored-by: Roger Wang <136131678+ywang96@users.noreply.github.com>
---
 docs/source/dev/multimodal/adding_multimodal_model.rst | 3 ++-
 docs/source/dev/multimodal/multimodal_index.rst        | 2 +-
 examples/llava_example.py                              | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index de95485b84a9..137f4d6d6e94 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -98,7 +98,8 @@ Here are some examples:
 4. (Optional) Register input processor
 --------------------------------------
 
-Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor.
+Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. 
+This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's ``forward`` call.
 You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
 
 .. code-block:: diff
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index c68fe8dbb953..6ae5b8d3986f 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -10,7 +10,7 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 :class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
 which allows you to pass in multi-modal input alongside text and token prompts.
 
-By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`this guide <adding_a_new_multimodal_model>`.
+By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model, please follow :ref:`the guide for adding a new multimodal model. <adding_a_new_multimodal_model>`.
 
 Guides
 ++++++
diff --git a/examples/llava_example.py b/examples/llava_example.py
index b2e1d53b593f..4fa0a1e247a2 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -22,7 +22,7 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False):
         disable_image_processor=disable_image_processor,
     )
 
-    prompt = "<image>\nUSER: What is the content of this image?\nASSISTANT:"
+    prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
     if disable_image_processor:
         image = torch.load("images/stop_sign_pixel_values.pt")

From 14f10fc2aaf76fb4220fc440e221d7b257028d83 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 09:03:32 +0000
Subject: [PATCH 125/181] Add link

---
 docs/source/dev/multimodal/adding_multimodal_model.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/dev/multimodal/adding_multimodal_model.rst b/docs/source/dev/multimodal/adding_multimodal_model.rst
index 137f4d6d6e94..0e9590639b22 100644
--- a/docs/source/dev/multimodal/adding_multimodal_model.rst
+++ b/docs/source/dev/multimodal/adding_multimodal_model.rst
@@ -99,7 +99,7 @@ Here are some examples:
 --------------------------------------
 
 Sometimes, there is a need to process inputs at the :class:`~vllm.LLMEngine` level before they are passed to the model executor. 
-This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's ``forward`` call.
+This is often due to the fact that unlike implementations in HuggingFace Transformers, the reshaping and/or expansion of multi-modal embeddings needs to take place outside model's :meth:`~torch.nn.Module.forward` call.
 You can register input processors via :meth:`INPUT_REGISTRY.register_input_processor <vllm.inputs.registry.InputRegistry.register_input_processor>`.
 
 .. code-block:: diff

From 7c335c3f8f2da224dae73d3751f0247765bbff60 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 09:05:12 +0000
Subject: [PATCH 126/181] Use `self.multi_modal_projector` directly

---
 vllm/model_executor/models/llava_next.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 676c6f4b7160..7e71bc92b8e7 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -1,5 +1,4 @@
-from typing import (Callable, Iterable, List, Literal, Optional, Tuple,
-                    TypedDict)
+from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import torch
 import torch.nn as nn
@@ -373,7 +372,6 @@ def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
     def _process_image_pixels(
         self,
         inputs: LlavaNextImagePixelInputs,
-        proj: Callable[[torch.Tensor], torch.Tensor],
     ) -> BatchedTensors:
         assert self.vision_tower is not None
 
@@ -384,7 +382,8 @@ def _process_image_pixels(
             stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
             stacked_image_features = self._image_pixels_to_features(
                 self.vision_tower, stacked_pixel_values)
-            stacked_patch_embeddings = proj(stacked_image_features)
+            stacked_patch_embeddings = self.multi_modal_projector(
+                stacked_image_features)
 
             return stacked_patch_embeddings.view(
                 b, num_patches, *stacked_patch_embeddings.shape[1:])
@@ -395,14 +394,13 @@ def _process_image_pixels(
             self.vision_tower, stacked_pixel_values)
 
         return [
-            proj(image_features) for image_features in torch.split(
-                stacked_image_features, num_patches_per_batch)
+            self.multi_modal_projector(image_features) for image_features in
+            torch.split(stacked_image_features, num_patches_per_batch)
         ]
 
     def _process_image_input(
             self, image_input: LlavaNextImagePixelInputs) -> BatchedTensors:
-        patch_embeddings = self._process_image_pixels(
-            image_input, proj=self.multi_modal_projector)
+        patch_embeddings = self._process_image_pixels(image_input)
 
         image_sizes = image_input.get("image_sizes")
         if image_sizes is None:

From 33c860e05dc82ef74b7485a2bc5b2556654997a3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 09:13:17 +0000
Subject: [PATCH 127/181] Allow users to send image token formatted prompt
 directly

---
 vllm/entrypoints/openai/serving_chat.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index f70e32c517b4..38b539c0b404 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -147,9 +147,14 @@ def _parse_chat_message_content_parts(
                         "Multiple 'image_url' input is currently not supported."
                     )
 
-                image_token = self.image_token_str
-                if image_token is not None:
-                    texts.append(image_token)
+                image_token_str = self.image_token_str
+                if image_token_str is not None:
+                    if any(image_token_str in text for text in texts):
+                        logger.warning(
+                            "Detected image token string in the text prompt. "
+                            "Skipping prompt formatting.")
+                    else:
+                        texts.append(image_token_str)
 
                 image_url = cast(ChatCompletionContentPartImageParam,
                                  part)["image_url"]

From e03bc5786b219617e59738d052208ce2ae12afa1 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 09:24:37 +0000
Subject: [PATCH 128/181] Factor out the code for placeholder token IDs

---
 vllm/model_executor/models/phi3v.py | 31 ++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index fafb315094a9..631be1b825a7 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from functools import lru_cache
 from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
 import numpy as np
@@ -21,7 +22,7 @@
 from transformers import CLIPVisionConfig, PretrainedConfig
 
 from vllm.attention import AttentionMetadata
-from vllm.config import CacheConfig, VisionLanguageConfig
+from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
@@ -340,6 +341,25 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
     return seq_data, mm_data
 
 
+# Reserve this function to also handle placeholders for additional images
+# [ref: PR #5820]
+@lru_cache
+def _get_image_placeholder_token_ids(model_config: ModelConfig,
+                                     idx: int) -> List[int]:
+    assert idx > 0
+
+    tokenizer = cached_get_tokenizer(model_config.tokenizer)
+
+    # We need to get the token for "<", not "▁<"
+    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json
+    a_token_id, = tokenizer.encode("a", add_special_tokens=False)
+    a_token_id_, *image_placeholder_token_ids = tokenizer.encode(
+        f"a<|image_{idx}|>", add_special_tokens=False)
+    assert a_token_id == a_token_id_
+
+    return image_placeholder_token_ids
+
+
 def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
     multi_modal_data = llm_inputs.get("multi_modal_data")
     if multi_modal_data is None or not isinstance(
@@ -367,14 +387,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
         image_feature_size = image_features.shape[-2]
 
     prompt_token_ids = llm_inputs["prompt_token_ids"]
-    tokenizer = cached_get_tokenizer(model_config.tokenizer)
-
-    # We need to get the token for "<", not "▁<"
-    # https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/raw/main/tokenizer.json
-    a_token_id, = tokenizer.encode("a", add_special_tokens=False)
-    a_token_id_, *image_1_token_ids = tokenizer.encode(
-        "a<|image_1|>", add_special_tokens=False)
-    assert a_token_id == a_token_id_
+    image_1_token_ids = _get_image_placeholder_token_ids(model_config, idx=1)
 
     new_token_ids: List[int] = []
     for i in range(len(prompt_token_ids) - len(image_1_token_ids) + 1):

From b270ac3f42842fc3bb268ebd440c892995986a98 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 09:25:55 +0000
Subject: [PATCH 129/181] Remove `-rx` flag

---
 .buildkite/test-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b205b0e8ac91..6931659dbc69 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -119,7 +119,7 @@ steps:
   mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
-    - pytest -v -s -rx models -m vlm
+    - pytest -v -s models -m vlm
 
 - label: Prefix Caching Test
   mirror_hardwares: [amd]

From 316122157d626284260124f4b5aabab6d6324a15 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 11:50:15 +0000
Subject: [PATCH 130/181] Fix distributed tests

---
 tests/models/test_llava.py | 7 +++++--
 tests/models/test_phi3v.py | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 23de2c476c5b..3ae00f79ca2a 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -4,8 +4,6 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
-from vllm.multimodal.image import ImagePixelData
-from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
@@ -90,6 +88,11 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from vllm.multimodal.image import ImagePixelData
+    from vllm.multimodal.utils import rescale_image_size
+
     model_id, vlm_config = model_and_config
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 1b6ad3b12213..25e17a0ab6b0 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -5,8 +5,6 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
-from vllm.multimodal.image import ImagePixelData
-from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
 
@@ -96,6 +94,11 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from vllm.multimodal.image import ImagePixelData
+    from vllm.multimodal.utils import rescale_image_size
+
     model_id, vlm_config = model_and_config
 
     # NOTE: take care of the order. run vLLM first, and then run HF.

From 85d108a1dd64384f4224f32cc5c113b45e61aa1e Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 12:43:34 +0000
Subject: [PATCH 131/181] Fix string mismatch warning

---
 tests/models/test_llava.py      | 8 +++++++-
 tests/models/test_llava_next.py | 8 +++++++-
 tests/models/test_phi3v.py      | 7 +++++--
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 3ae00f79ca2a..1d33fbd6d846 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -55,13 +55,19 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
+    eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
         token_id for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = output_str.lstrip() \
+
+    hf_output_str = output_str \
         .replace(image_token_str * vlm_config.image_feature_size, "")
+    assert hf_output_str[0] == " "
+    hf_output_str = hf_output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
 
     return hf_output_ids, hf_output_str, out_logprobs
 
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 411e45f65ec4..e13b9e6badc8 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -67,12 +67,18 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     image_token_str = tokenizer.decode(image_token_id)
+    eos_token_id = tokenizer.eos_token_id
 
     hf_output_ids = [
         token_id for idx, token_id in enumerate(output_ids)
         if token_id != image_token_id or output_ids[idx - 1] != image_token_id
     ]
-    hf_output_str = re.sub(fr"({image_token_str})+", "", output_str).lstrip()
+
+    hf_output_str = re.sub(fr"({image_token_str})+", "", output_str)
+    assert hf_output_str[0] == " "
+    hf_output_str = hf_output_str[1:]
+    if hf_output_ids[-1] == eos_token_id:
+        hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
 
     return hf_output_ids, hf_output_str, out_logprobs
 
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 25e17a0ab6b0..6acfd2582bf5 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -54,15 +54,18 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
     output_ids, output_str, out_logprobs = vllm_output
+
     output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
+    assert output_str_without_image[0] == " "
+    output_str_without_image = output_str_without_image[1:]
 
     hf_output_str = output_str_without_image.replace("<|user|>", "") \
         .replace("<|end|>\n<|assistant|>", " ")
 
     tokenizer = AutoTokenizer.from_pretrained(model_id)
     hf_output_ids = tokenizer.encode(output_str_without_image)
-    assert hf_output_ids[:2] == [1, 29871]
-    hf_output_ids = hf_output_ids[2:]
+    assert hf_output_ids[0] == 1
+    hf_output_ids = hf_output_ids[1:]
 
     return hf_output_ids, hf_output_str, out_logprobs
 

From d648e32ad681b7a61d835509e316025af88fe49e Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 12:50:43 +0000
Subject: [PATCH 132/181] Relax phi3v test; add TODO for llava tests

---
 tests/models/test_llava.py      | 2 ++
 tests/models/test_llava_next.py | 2 ++
 tests/models/test_phi3v.py      | 4 +++-
 3 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 1d33fbd6d846..722e43978400 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -153,6 +153,8 @@ def run_test(
 
     for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                         vllm_outputs_per_image):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index e13b9e6badc8..8730539bac70 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -156,6 +156,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
 
     for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                         vllm_outputs_per_image):
+        # TODO: Check whether using original CLIPVisionModel can improve
+        # consistency against HF
         check_logprobs_close(
             outputs_0_lst=hf_outputs,
             outputs_1_lst=[
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 6acfd2582bf5..e3f3aba2c669 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -191,6 +191,8 @@ def run_test(
         )
 
 
+# Since we use _attn_implementation="eager" for hf_runner, there is more
+# significant numerical difference. The basic `logprobs=5` fails to pass.
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize(
     "size_factors",
@@ -207,7 +209,7 @@ def run_test(
 )
 @pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [128])
-@pytest.mark.parametrize("num_logprobs", [5])
+@pytest.mark.parametrize("num_logprobs", [10])
 def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
                 size_factors, dtype: str, max_tokens: int,
                 num_logprobs: int) -> None:

From fde5f26910da47857b5ea8e7e853221535c8553f Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Sun, 30 Jun 2024 13:54:59 +0000
Subject: [PATCH 133/181] Fix distributed tests

---
 tests/models/test_llava_next.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 8730539bac70..aa7a890dc337 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -5,8 +5,6 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
-from vllm.multimodal.image import ImagePixelData
-from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS
@@ -112,6 +110,11 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from vllm.multimodal.image import ImagePixelData
+    from vllm.multimodal.utils import rescale_image_size
+
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
     vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]

From d432934f824b1d71a822b03538e8c6f2f020f1f3 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Sun, 30 Jun 2024 18:58:31 -0700
Subject: [PATCH 134/181] address comments

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 docs/source/models/vlm.rst               |  2 +-
 tests/conftest.py                        |  8 ---
 vllm/entrypoints/openai/serving_chat.py  | 69 +++++++++++-------------
 vllm/model_executor/models/llava.py      |  8 ---
 vllm/model_executor/models/llava_next.py |  3 ++
 vllm/multimodal/utils.py                 |  7 +--
 6 files changed, 37 insertions(+), 60 deletions(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 639e7a9a9284..d4bb86ecf576 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`ExternalMultiModalDataDict`.
+* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.ExternalMultiModalDataDict`.
 
 .. code-block:: python
 
diff --git a/tests/conftest.py b/tests/conftest.py
index eff34e9d2937..54c250692df5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -46,14 +46,6 @@ def _read_prompts(filename: str) -> List[str]:
 class ImageAsset:
     name: Literal["stop_sign", "cherry_blossom"]
 
-    @cached_property
-    def pixel_values(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt")
-
-    @cached_property
-    def image_features(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt")
-
     @cached_property
     def pil_image(self) -> Image.Image:
         return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index bd0d82545ca5..a5ee08c57a09 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -26,9 +26,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
-from vllm.multimodal.image import ImageData
-from vllm.multimodal.utils import (async_get_and_parse_image,
-                                   get_full_image_text_prompt)
+from vllm.multimodal import ExternalMultiModalDataDict
+from vllm.multimodal.utils import get_full_image_text_prompt, ImageFetchAiohttp
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
@@ -47,7 +46,7 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
     messages: List[ConversationMessage]
-    image_futures: List[Awaitable[ImageData]] = field(default_factory=list)
+    mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field(default_factory=list)
 
 
 class OpenAIServingChat(OpenAIServing):
@@ -102,7 +101,7 @@ def _parse_chat_message_content_parts(
         parts: Iterable[ChatCompletionContentPartParam],
     ) -> ChatMessageParseResult:
         texts: List[str] = []
-        image_futures: List[Awaitable[ImageData]] = []
+        mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = []
 
         vlm_config: Optional[VisionLanguageConfig] = getattr(
             self.engine.engine, "vision_language_config", None)
@@ -112,39 +111,36 @@ def _parse_chat_message_content_parts(
             part_type = part["type"]
             if part_type == "text":
                 text = cast(ChatCompletionContentPartTextParam, part)["text"]
-
                 texts.append(text)
             elif part_type == "image_url":
                 if vlm_config is None:
                     raise ValueError(
                         "'image_url' input is not supported as the loaded "
                         "model is not multimodal.")
+                assert self.tokenizer is not None
+                image_url = cast(ChatCompletionContentPartImageParam,
+                                    part)["image_url"]
 
-                elif len(image_futures) == 0:
-                    assert self.tokenizer is not None
-                    image_url = cast(ChatCompletionContentPartImageParam,
-                                     part)["image_url"]
-
-                    if image_url.get("detail", "auto") != "auto":
-                        logger.warning(
-                            "'image_url.detail' is currently not supported and "
-                            "will be ignored.")
+                if image_url.get("detail", "auto") != "auto":
+                    logger.warning(
+                        "'image_url.detail' is currently not supported and "
+                        "will be ignored.")
 
-                    image_future = async_get_and_parse_image(image_url["url"])
-                    image_futures.append(image_future)
+                async def async_get_and_parse_image(image_url: str):
+                    with await ImageFetchAiohttp.fetch_image(image_url) as image:
+                        return {"image": image}
 
-                else:
-                    raise NotImplementedError(
-                        "Multiple 'image_url' input is currently not supported."
-                    )
+                mm_future = async_get_and_parse_image(image_url["url"])
+                mm_futures.append(mm_future)
 
             else:
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
         text_prompt = "\n".join(texts)
 
-        if vlm_config is not None and len(image_futures):
-
+        if vlm_config is not None and len(mm_futures):
+            
+            assert len(mm_futures) == 1, "Multiple images is not supported."
             (image_token_prompt,
              image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
 
@@ -171,7 +167,7 @@ def _parse_chat_message_content_parts(
             messages = [ConversationMessage(role=role, content=text_prompt)]
 
         return ChatMessageParseResult(messages=messages,
-                                      image_futures=image_futures)
+                                      mm_futures=mm_futures)
 
     def _parse_chat_message_content(
         self,
@@ -181,10 +177,10 @@ def _parse_chat_message_content(
         content = message.get("content")
 
         if content is None:
-            return ChatMessageParseResult(messages=[], image_futures=[])
+            return ChatMessageParseResult(messages=[], mm_futures=[])
         if isinstance(content, str):
             messages = [ConversationMessage(role=role, content=content)]
-            return ChatMessageParseResult(messages=messages, image_futures=[])
+            return ChatMessageParseResult(messages=messages, mm_futures=[])
 
         return self._parse_chat_message_content_parts(role, content)
 
@@ -209,13 +205,13 @@ async def create_chat_completion(
 
         try:
             conversation: List[ConversationMessage] = []
-            image_futures: List[Awaitable[ImageData]] = []
+            mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = []
 
             for msg in request.messages:
                 chat_parsed_result = self._parse_chat_message_content(msg)
 
                 conversation.extend(chat_parsed_result.messages)
-                image_futures.extend(chat_parsed_result.image_futures)
+                mm_futures.extend(chat_parsed_result.mm_futures)
 
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
@@ -226,15 +222,14 @@ async def create_chat_completion(
             logger.error("Error in applying chat template from request: %s", e)
             return self.create_error_response(str(e))
 
-        # Fetch image data
-        image_data: Optional[ImageData] = None
+        mm_data: Optional[ExternalMultiModalDataDict] = None
         try:
-            if len(image_futures):
-                # since we support only single image currently
-                assert len(image_futures) == 1
-                image_data = await image_futures[0]
+            if len(mm_futures):
+                # since we support only single mm data currently
+                assert len(mm_futures) == 1
+                mm_data = await mm_futures[0]
         except Exception as e:
-            logger.error("Error in loading image data: %s", e)
+            logger.error("Error in loading multi-modal data: %s", e)
             return self.create_error_response(str(e))
 
         request_id = f"cmpl-{random_uuid()}"
@@ -265,8 +260,8 @@ async def create_chat_completion(
             "prompt": prompt_text,
             "prompt_token_ids": prompt_ids,
         }
-        if image_data is not None:
-            inputs["multi_modal_data"] = image_data
+        if mm_data is not None:
+            inputs["multi_modal_data"] = mm_data
 
         is_tracing_enabled = await self.engine.is_tracing_enabled()
         trace_headers = None
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 72d39bb124ef..78823a513488 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -228,14 +228,6 @@ def forward(
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
             pixel_values: The pixels in each input image.
-                Expects a batch with shape `[1, 3, 336, 336]`.
-                (Only applicable to `PIXEL_VALUES` mode)
-
-        See also:
-            Each input maps to huggingface implementation, as follows:
-
-            - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360
-            - `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437
         """
         image_input = self._parse_and_validate_image_input(**kwargs)
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index dfdf512b968a..e18198720ed9 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -232,6 +232,9 @@ def _parse_and_validate_image_input(
         pixel_values = kwargs.pop("pixel_values", None)
         image_sizes = kwargs.pop("image_sizes", None)
 
+        if pixel_values is None or image_sizes is None:
+            return None
+
         if not isinstance(pixel_values, torch.Tensor):
             raise ValueError("Incorrect type of pixel values. "
                              f"Got type: {type(pixel_values)}")
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index c80cd7adde58..aad9822db78d 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -8,7 +8,6 @@
 
 from vllm.config import ModelConfig
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
-from vllm.multimodal.image import ImageData
 
 
 class ImageFetchAiohttp:
@@ -53,14 +52,10 @@ async def fetch_image(cls, image_url: str) -> Image.Image:
                 "Invalid 'image_url': A valid 'image_url' must start "
                 "with either 'data:image' or 'http'.")
 
+        image.load()
         return image
 
 
-async def async_get_and_parse_image(image_url: str) -> ImageData:
-    with await ImageFetchAiohttp.fetch_image(image_url) as image:
-        return ImageData(image)
-
-
 def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
     """Encode a pillow image to base64 format."""
 

From ab347bc649c2867114d6e7c0d044ea795bc895a6 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Sun, 30 Jun 2024 19:11:36 -0700
Subject: [PATCH 135/181] format

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 vllm/entrypoints/openai/serving_chat.py | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index a5ee08c57a09..8c844f83077d 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -27,7 +27,7 @@
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
 from vllm.multimodal import ExternalMultiModalDataDict
-from vllm.multimodal.utils import get_full_image_text_prompt, ImageFetchAiohttp
+from vllm.multimodal.utils import ImageFetchAiohttp, get_full_image_text_prompt
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
@@ -46,7 +46,8 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
     messages: List[ConversationMessage]
-    mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field(default_factory=list)
+    mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field(
+        default_factory=list)
 
 
 class OpenAIServingChat(OpenAIServing):
@@ -119,7 +120,7 @@ def _parse_chat_message_content_parts(
                         "model is not multimodal.")
                 assert self.tokenizer is not None
                 image_url = cast(ChatCompletionContentPartImageParam,
-                                    part)["image_url"]
+                                 part)["image_url"]
 
                 if image_url.get("detail", "auto") != "auto":
                     logger.warning(
@@ -127,7 +128,8 @@ def _parse_chat_message_content_parts(
                         "will be ignored.")
 
                 async def async_get_and_parse_image(image_url: str):
-                    with await ImageFetchAiohttp.fetch_image(image_url) as image:
+                    with await ImageFetchAiohttp.fetch_image(image_url
+                                                             ) as image:
                         return {"image": image}
 
                 mm_future = async_get_and_parse_image(image_url["url"])
@@ -139,7 +141,7 @@ async def async_get_and_parse_image(image_url: str):
         text_prompt = "\n".join(texts)
 
         if vlm_config is not None and len(mm_futures):
-            
+
             assert len(mm_futures) == 1, "Multiple images is not supported."
             (image_token_prompt,
              image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
@@ -166,8 +168,7 @@ async def async_get_and_parse_image(image_url: str):
         else:
             messages = [ConversationMessage(role=role, content=text_prompt)]
 
-        return ChatMessageParseResult(messages=messages,
-                                      mm_futures=mm_futures)
+        return ChatMessageParseResult(messages=messages, mm_futures=mm_futures)
 
     def _parse_chat_message_content(
         self,

From 404700f8e4d683c3ae3006be9f31c5d3e31bd68c Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Sun, 30 Jun 2024 20:55:58 -0700
Subject: [PATCH 136/181] rm ctx

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 vllm/entrypoints/openai/serving_chat.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 8c844f83077d..55d5c383a6ee 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -128,9 +128,8 @@ def _parse_chat_message_content_parts(
                         "will be ignored.")
 
                 async def async_get_and_parse_image(image_url: str):
-                    with await ImageFetchAiohttp.fetch_image(image_url
-                                                             ) as image:
-                        return {"image": image}
+                    image = await ImageFetchAiohttp.fetch_image(image_url)
+                    return {"image": image}
 
                 mm_future = async_get_and_parse_image(image_url["url"])
                 mm_futures.append(mm_future)

From 95a1fc54f6e3fe10e1351da7956cb21ad8e7a89c Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 1 Jul 2024 06:49:09 +0000
Subject: [PATCH 137/181] Fix distributed test

---
 tests/models/test_llava.py      | 9 ++++++---
 tests/models/test_llava_next.py | 9 ++++++---
 tests/models/test_phi3v.py      | 9 ++++++---
 tests/models/utils.py           | 8 +++++---
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 722e43978400..51f8a1a44295 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,14 +1,17 @@
-from typing import List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
-from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from .utils import check_logprobs_close
 
+if TYPE_CHECKING:
+    # it may call torch.cuda.device_count()
+    from vllm.sequence import SampleLogprobs
+
 pytestmark = pytest.mark.vlm
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -43,7 +46,7 @@ def iter_llava_configs(model_name: str):
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
-                                         Optional[SampleLogprobs]],
+                                         Optional["SampleLogprobs"]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index aa7a890dc337..5ec1907c43d8 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,15 +1,18 @@
 import re
-from typing import List, Optional, Tuple
+from typing import TYPE_CHECKING, List, Optional, Tuple
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
-from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS
 from .utils import check_logprobs_close
 
+if TYPE_CHECKING:
+    # it may call torch.cuda.device_count()
+    from vllm.sequence import SampleLogprobs
+
 pytestmark = pytest.mark.vlm
 
 _PREFACE = (
@@ -53,7 +56,7 @@ def iter_llava_next_configs(model_name: str):
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
-                                         Optional[SampleLogprobs]],
+                                         Optional["SampleLogprobs"]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index e3f3aba2c669..d460133e40e5 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,16 +1,19 @@
 import re
-from typing import List, Optional, Tuple, Type
+from typing import TYPE_CHECKING, List, Optional, Tuple, Type
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
-from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from .utils import check_logprobs_close
 
+if TYPE_CHECKING:
+    # it may call torch.cuda.device_count()
+    from vllm.sequence import SampleLogprobs
+
 pytestmark = pytest.mark.vlm
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
@@ -46,7 +49,7 @@ def iter_phi3v_configs(model_name: str):
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
-                                         Optional[SampleLogprobs]],
+                                         Optional["SampleLogprobs"]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 51d57129d9d2..a6c46554edaf 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,7 +1,9 @@
 import warnings
-from typing import Dict, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union
 
-from vllm.sequence import SampleLogprobs
+if TYPE_CHECKING:
+    # it may call torch.cuda.device_count()
+    from vllm.sequence import SampleLogprobs
 
 TokensText = Tuple[List[int], str]
 
@@ -36,7 +38,7 @@ def check_outputs_equal(
 
 TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
                                                                     float]],
-                                                          SampleLogprobs]]]
+                                                          "SampleLogprobs"]]]
 
 
 def check_logprobs_close(

From 1e878235adc65e770036526214d3648d2fcb37c3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 1 Jul 2024 07:21:44 +0000
Subject: [PATCH 138/181] Update docs about prompt formatting

---
 docs/source/models/vlm.rst | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 4335f974ae57..1994562f2e57 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -52,6 +52,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
 .. code-block:: python
 
+    # Refer to the HuggingFace repo for the correct format to use
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
 
     # Load the image using PIL.Image
@@ -114,6 +115,8 @@ To consume the server, you can use the OpenAI client like in the example below:
         messages=[{
             "role": "user",
             "content": [
+                # NOTE: The prompt formatting with the image token `<image>` is not needed
+                # since the prompt will be processed automatically by the API server.
                 {"type": "text", "text": "What's in this image?"},
                 {
                     "type": "image_url",

From 55ab3e4b7f40e025f231f03fb163c5adfc0f1b5c Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 1 Jul 2024 07:23:33 +0000
Subject: [PATCH 139/181] Remove unused parameter

---
 vllm/model_executor/models/clip.py       | 1 -
 vllm/model_executor/models/llava.py      | 2 --
 vllm/model_executor/models/llava_next.py | 2 --
 vllm/model_executor/models/phi3v.py      | 1 -
 4 files changed, 6 deletions(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index 6b4b91c671cc..e9276f354c22 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -86,7 +86,6 @@ def dummy_feature_data_for_clip(
 
 def input_processor_for_clip(
     model_config: ModelConfig,
-    multimodal_config: VisionLanguageConfig,
     hf_config: CLIPVisionConfig,
     llm_inputs: LLMInputs,
     *,
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 56d84c3b5d33..a7c5fba7fcd8 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -102,14 +102,12 @@ def input_processor_for_llava(ctx: InputContext, llm_inputs: LLMInputs):
         return llm_inputs
 
     model_config = ctx.model_config
-    multimodal_config = ctx.get_multimodal_config()
     hf_config = ctx.get_hf_config(LlavaConfig)
     vision_config = hf_config.vision_config
 
     if isinstance(vision_config, CLIPVisionConfig):
         return input_processor_for_clip(
             model_config,
-            multimodal_config,
             vision_config,
             llm_inputs,
             image_token_id=hf_config.image_token_index,
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 7e71bc92b8e7..064ced11d0d4 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -165,7 +165,6 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
         return llm_inputs
 
     model_config = ctx.model_config
-    multimodal_config = ctx.get_multimodal_config()
     hf_config = ctx.get_hf_config(LlavaNextConfig)
     vision_config = hf_config.vision_config
 
@@ -187,7 +186,6 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
     if isinstance(vision_config, CLIPVisionConfig):
         return input_processor_for_clip(
             model_config,
-            multimodal_config,
             vision_config,
             llm_inputs,
             image_token_id=hf_config.image_token_index,
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 631be1b825a7..17f5802cd0a9 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -407,7 +407,6 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
 
     return input_processor_for_clip(
         model_config,
-        multimodal_config,
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         llm_inputs,
         image_token_id=multimodal_config.image_token_id,

From 21da5b8c70ce479737f304d7ca435433f401fe1a Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 1 Jul 2024 07:23:59 +0000
Subject: [PATCH 140/181] Remove unused import

---
 vllm/model_executor/models/clip.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index e9276f354c22..4f4f1f4d6efd 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -8,7 +8,7 @@
 from transformers import CLIPVisionConfig
 from transformers.models.clip.modeling_clip import CLIPAttention
 
-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.inputs import LLMInputs
 from vllm.model_executor.layers.activation import get_act_fn
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,

From 525fe8f8753a84e2834d954f0a6a3f2c019421a3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Mon, 1 Jul 2024 08:30:30 +0000
Subject: [PATCH 141/181] Fix distributed test

---
 vllm/config.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 45e181aa7007..8e0b9ce95931 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -10,7 +10,6 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
-from vllm.model_executor.models import ModelRegistry
 from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
@@ -177,6 +176,9 @@ def _verify_tokenizer_mode(self) -> None:
         self.tokenizer_mode = tokenizer_mode
 
     def _verify_embedding_mode(self) -> None:
+        # it may call torch.cuda.device_count()
+        from vllm.model_executor.models import ModelRegistry
+
         architectures = getattr(self.hf_config, "architectures", [])
         self.embedding_mode = any(
             ModelRegistry.is_embedding_model(arch) for arch in architectures)

From 04ebb6809a77e81bd41c7a9565b8ba2a522d98b1 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Mon, 1 Jul 2024 08:18:51 -0700
Subject: [PATCH 142/181] rm ImageData and MultiModalData

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 .../dev/multimodal/multimodal_index.rst       |   2 +-
 examples/openai_vision_api_client.py          |   1 -
 tests/conftest.py                             |  15 +--
 tests/models/test_llava.py                    |   4 +-
 tests/models/test_llava_next.py               |   4 +-
 tests/models/test_phi3v.py                    |   4 +-
 tests/multimodal/test_mapper.py               |   9 +-
 tests/spec_decode/e2e/conftest.py             |   4 +-
 vllm/inputs/data.py                           |   8 +-
 vllm/inputs/registry.py                       |   7 +-
 vllm/model_executor/models/clip.py            |   5 +-
 vllm/model_executor/models/llava.py           |   7 +-
 vllm/model_executor/models/llava_next.py      |  38 +++---
 vllm/model_executor/models/phi3v.py           |  15 ++-
 vllm/multimodal/__init__.py                   |   3 +-
 vllm/multimodal/base.py                       |  57 +++------
 vllm/multimodal/image.py                      |  39 ++----
 vllm/multimodal/registry.py                   | 114 +++++++-----------
 vllm/sequence.py                              |   4 +-
 19 files changed, 126 insertions(+), 214 deletions(-)

diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index f6fdfc1debff..d551eb899acc 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -29,7 +29,7 @@ Registry
 Base Classes
 ------------
 
-.. autoclass:: vllm.multimodal.MultiModalData
+.. autoclass:: vllm.multimodal.ExternalMultiModalDataDict
     :members:
     :show-inheritance:
 
diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py
index 26f2aa651fca..fcda1345f576 100644
--- a/examples/openai_vision_api_client.py
+++ b/examples/openai_vision_api_client.py
@@ -3,7 +3,6 @@
 Launch the vLLM server with the following command:
 python -m vllm.entrypoints.openai.api_server \
     --model llava-hf/llava-1.5-7b-hf \
-    --image-input-type pixel_values \
     --image-token-id 32000 \
     --image-input-shape 1,3,336,336 \
     --image-feature-size 576 \
diff --git a/tests/conftest.py b/tests/conftest.py
index 43609f257da9..dca87149ce32 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,8 +5,8 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple,
-                    TypedDict, TypeVar)
+from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
+                    TypeVar)
 
 import pytest
 import torch
@@ -22,12 +22,7 @@
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-
-if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
-else:
-    # it will call torch.cuda.device_count()
-    MultiModalData = None
+from vllm.multimodal import ExternalMultiModalDataDict
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
@@ -433,7 +428,7 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List[ExternalMultiModalDataDict]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -482,7 +477,7 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List[ExternalMultiModalDataDict]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index f2bac4235d8f..7f4a3a597874 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -75,8 +75,8 @@ def run_test(
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide ExternalMultiModalDataDict objects 
+    and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index b7dc3e8ef796..b03e00923ca2 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -82,8 +82,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide ExternalMultiModalDataDict objects 
+    and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 7297afb4a1c9..5809e6f83755 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -84,8 +84,8 @@ def run_test(
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide MultiModalData objects and corresponding
-    vision language config as input.
+    For vllm runner, we provide ExternalMultiModalDataDict objects 
+    and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 2238b0e9f9f7..bdbbd9abfc5c 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -4,7 +4,6 @@
 
 from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImageData
 
 from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 
@@ -33,7 +32,7 @@ def test_clip_image_processor(image_assets, dtype):
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImageData(asset.pil_image),
+            {"image": asset.pil_image},
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -72,7 +71,7 @@ def test_llava_next_image_processor(image_assets, dtype):
         ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImageData(asset.pil_image),
+            {"image": asset.pil_image},
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -102,11 +101,11 @@ def test_image_pixel_types(image_assets, dtype):
     for asset in image_assets:
         image_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImageData(asset.pil_image),
+            {"image": asset.pil_image},
         )
         tensor_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            ImageData(asset.pil_image),
+            {"image": asset.pil_image},
         )
 
         assert image_result.keys() == tensor_result.keys()
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 60dfe33f2918..f75caef6a5b5 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -11,7 +11,7 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
-from vllm.multimodal import MultiModalData
+from vllm.multimodal import ExternalMultiModalDataDict
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob
@@ -91,7 +91,7 @@ def generate(
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
+        multi_modal_data: Optional[ExternalMultiModalDataDict] = None,
     ) -> List[RequestOutput]:
 
         if prompts is None:
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index 2f761057bea5..df8c38ead21a 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -4,7 +4,7 @@
 from typing_extensions import NotRequired
 
 if TYPE_CHECKING:
-    from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData
+    from vllm.multimodal import ExternalMultiModalDataDict
 
 
 class ParsedText(TypedDict):
@@ -72,7 +72,7 @@ class TextPrompt(TypedDict):
     prompt: str
     """The input text to be tokenized before passing to the model."""
 
-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["ExternalMultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -85,7 +85,7 @@ class TokensPrompt(TypedDict):
     prompt_token_ids: List[int]
     """A list of token IDs to pass to the model."""
 
-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["ExternalMultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -103,7 +103,7 @@ class TextTokensPrompt(TypedDict):
     prompt_token_ids: List[int]
     """The token IDs of the prompt."""
 
-    multi_modal_data: NotRequired["MultiModalData"]
+    multi_modal_data: NotRequired["ExternalMultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 8f4e108b8cca..85552db4aefe 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -12,7 +12,7 @@
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VisionLanguageConfig
-    from vllm.multimodal import MultiModalData
+    from vllm.multimodal import ExternalMultiModalDataDict
     from vllm.sequence import SequenceData
 
 logger = init_logger(__name__)
@@ -66,7 +66,8 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C:
 N = TypeVar("N", bound=Type[nn.Module])
 
 DummyDataFactory = Callable[[InputContext, int],
-                            Tuple["SequenceData", Optional["MultiModalData"]]]
+                            Tuple["SequenceData",
+                                  Optional["ExternalMultiModalDataDict"]]]
 """
 Create dummy data to be inputted into the model.
 
@@ -94,7 +95,7 @@ def _default_dummy_data_factory(
         self,
         ctx: InputContext,
         seq_len: int,
-    ) -> Tuple["SequenceData", Optional["MultiModalData"]]:
+    ) -> Tuple["SequenceData", Optional["ExternalMultiModalDataDict"]]:
         """
         The default dummy data factory represents the longest possible text
         that can be inputted to the model.
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py
index f6dfbf17a792..5212e2808fb3 100644
--- a/vllm/model_executor/models/clip.py
+++ b/vllm/model_executor/models/clip.py
@@ -12,7 +12,6 @@
 from vllm.model_executor.layers.linear import (ColumnParallelLinear,
                                                RowParallelLinear)
 from vllm.model_executor.layers.quantization import QuantizationConfig
-from vllm.multimodal.image import ImageData
 from vllm.sequence import SequenceData
 
 
@@ -49,7 +48,7 @@ def dummy_seq_data_for_clip(
     return SequenceData(token_ids)
 
 
-def dummy_pixel_data_for_clip(
+def dummy_image_for_clip(
     hf_config: CLIPVisionConfig,
     *,
     image_width_override: Optional[int] = None,
@@ -62,7 +61,7 @@ def dummy_pixel_data_for_clip(
         height = image_height_override
 
     image = Image.new("RGB", (width, height), color=0)
-    return ImageData(image)
+    return {"image": image}
 
 
 # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 8c3926e4a5f0..5e48c5530a0c 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -17,10 +17,10 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput
 
-from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
+from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision
 
 _KEYS_TO_MODIFY_MAPPING = {
@@ -89,8 +89,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int):
             image_token_id=hf_config.image_token_index,
         )
 
-        mm_data: MultiModalData
-        mm_data = dummy_pixel_data_for_clip(vision_config)
+        mm_data = dummy_image_for_clip(vision_config)
         return seq_data, mm_data
 
     msg = f"Unsupported vision config: {type(vision_config)}"
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index c113ead67b05..3c0988137f7c 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -2,6 +2,7 @@
 
 import torch
 import torch.nn as nn
+from PIL import Image
 from transformers import CLIPVisionConfig, LlavaNextConfig
 from transformers.models.llava_next.modeling_llava_next import (
     get_anyres_image_grid_shape, unpad_image)
@@ -20,11 +21,10 @@
 from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
-from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
-from vllm.multimodal.image import ImageData
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput
 
-from .clip import (dummy_pixel_data_for_clip, dummy_seq_data_for_clip,
+from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip,
                    get_clip_patch_grid_length)
 from .interfaces import SupportsVision
 from .llava import LlavaMultiModalProjector, merge_vision_embeddings
@@ -127,8 +127,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
             image_feature_size_override=image_feature_size,
         )
 
-        mm_data: MultiModalData
-        mm_data = dummy_pixel_data_for_clip(
+        mm_data = dummy_image_for_clip(
             vision_config,
             image_width_override=dummy_width,
             image_height_override=dummy_height,
@@ -140,28 +139,23 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     raise NotImplementedError(msg)
 
 
-def _pixel_mapper(ctx: InputContext,
-                  data: ImageData) -> Dict[str, torch.Tensor]:
-    image = data.image
+def _pixel_mapper(ctx: InputContext, image: object) -> Dict[str, torch.Tensor]:
 
-    if isinstance(image, torch.Tensor):
-        pixel_values = image.to(ctx.model_config.dtype)
-        batch_size, _, _, h, w = pixel_values.shape
-        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
+    if isinstance(image, Image.Image):
 
-        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
+        # Temporary patch before dynamic number of image tokens is supported
+        _, _, h, w = ctx.get_multimodal_config().image_input_shape
+        if (w, h) != (image.width, image.height):
+            logger.warning(
+                "Dynamic image shape is currently not supported. "
+                "Resizing input image to (%d, %d).", w, h)
 
-    # Temporary patch before dynamic number of image tokens is supported
-    _, _, h, w = ctx.get_multimodal_config().image_input_shape
-    if (w, h) != (image.width, image.height):
-        logger.warning(
-            "Dynamic image shape is currently not supported. "
-            "Resizing input image to (%d, %d).", w, h)
+            image = image.resize((w, h))
 
-        data.image = image.resize((w, h))
+        return MULTIMODAL_REGISTRY._get_plugin("image") \
+            ._default_input_mapper(ctx, image)
 
-    return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \
-        ._default_input_mapper(ctx, data)
+    raise TypeError(f"Invalid type for 'image': {type(image)}")
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper(_pixel_mapper)
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 5e53fed9aa92..a16f7f0ea570 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -35,10 +35,9 @@
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImageData
 from vllm.sequence import SamplerOutput
 
-from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
+from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision
 
 logger = init_logger(__name__)
@@ -286,7 +285,7 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
         image_token_id=32044,
         image_feature_size_override=image_feature_size,
     )
-    mm_data = dummy_pixel_data_for_clip(
+    mm_data = dummy_image_for_clip(
         CLIP_VIT_LARGE_PATCH14_336_CONFIG,
         image_width_override=dummy_width,
         image_height_override=dummy_height,
@@ -331,8 +330,7 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):
 
 
 def _image_processor(ctx: InputContext,
-                     data: ImageData) -> Dict[str, torch.Tensor]:
-    image = data.image
+                     image: object) -> Dict[str, torch.Tensor]:
 
     if isinstance(image, Image.Image):
         # Temporary patch before dynamic number of image tokens is supported
@@ -343,10 +341,11 @@ def _image_processor(ctx: InputContext,
                 "Dynamic image shape is currently not supported. "
                 "Resizing input image to (%d, %d).", w, h)
 
-            data.image = image.resize((w, h))
+            image = image.resize((w, h))
 
-    return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \
-            ._default_input_mapper(ctx, data)
+        return MULTIMODAL_REGISTRY._get_plugin("image") \
+                ._default_input_mapper(ctx, image)
+    raise TypeError(f"Invalid type for 'image': {type(image)}")
 
 
 @MULTIMODAL_REGISTRY.register_image_input_mapper(_image_processor)
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index a9bd58a29549..ebbad488ecdc 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,4 @@
-from .base import ExternalMultiModalDataDict, MultiModalData, MultiModalPlugin
+from .base import ExternalMultiModalDataDict, MultiModalPlugin
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -11,7 +11,6 @@
 """
 
 __all__ = [
-    "MultiModalData",
     "MultiModalPlugin",
     "MULTIMODAL_REGISTRY",
     "MultiModalRegistry",
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 5a2e28cd7a39..8222e94c3c85 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,6 +1,6 @@
 from abc import ABC, abstractmethod
-from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, Optional,
-                    Tuple, Type, TypedDict, TypeVar, Union)
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Type,
+                    TypedDict, TypeVar, Union)
 
 from vllm.config import ModelConfig
 from vllm.inputs import InputContext
@@ -13,27 +13,6 @@
 
 logger = init_logger(__name__)
 
-
-class MultiModalData:
-    """
-    Base class that contains multi-modal data.
-
-    This is for internal use.
-
-    To add a new modality, add a new file under ``multimodal`` directory.
-
-    In this new file, subclass :class:`~MultiModalData` and
-    :class:`~MultiModalPlugin`.
-
-    Finally, register the new plugin to
-    :const:`vllm.multimodal.MULTIMODAL_REGISTRY` (beyond default plugins).
-    This enables models to call :meth:`MultiModalRegistry.map_input` for
-    the new modality.
-    """
-    pass
-
-
-D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type["nn.Module"])
 
 
@@ -44,13 +23,14 @@ class ExternalMultiModalDataBuiltins(TypedDict, total=False):
 ExternalMultiModalDataDict = Union[ExternalMultiModalDataBuiltins, Dict[str,
                                                                         Any]]
 
-MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]]
+MultiModalInputMapper = Callable[[InputContext, object], Dict[str,
+                                                              "torch.Tensor"]]
 """Return a dictionary to be passed as keyword arguments to
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
 
 
-class MultiModalPlugin(ABC, Generic[D]):
+class MultiModalPlugin(ABC):
     """
     Base class that defines data processing logic for a specific modality.
 
@@ -63,29 +43,18 @@ class MultiModalPlugin(ABC, Generic[D]):
 
     def __init__(self) -> None:
         self._input_mappers: Dict[Type["nn.Module"],
-                                  MultiModalInputMapper[D]] = {}
+                                  MultiModalInputMapper] = {}
 
     @abstractmethod
-    def get_internal_data_type(self) -> Type[D]:
+    def get_data_key(self) -> str:
         """
-        Get the modality (subclass of :class:`~MultiModalData`) served by
-        this plugin.
-        """
-        raise NotImplementedError
-
-    @abstractmethod
-    def get_external_data_type(self) -> Tuple[str, Type[Any]]:
-        """The data type that this plugin handles. 
-        
-        For `LLM.generate(multi_modal_data={"key": value})` will 
-        be handled by plugin with an external data type of
-        (key, type(value)). 
+        Get the data key corresponding to the modality.
         """
         raise NotImplementedError
 
     @abstractmethod
     def _default_input_mapper(self, ctx: InputContext,
-                              data: D) -> Dict[str, "torch.Tensor"]:
+                              data: object) -> Dict[str, "torch.Tensor"]:
         """Return a dictionary to be passed as keyword arguments to
         :meth:`~torch.nn.Module.forward`. This is similar in concept to
         tokenizers and processors in HuggingFace Transformers.
@@ -94,7 +63,7 @@ def _default_input_mapper(self, ctx: InputContext,
 
     def register_input_mapper(
         self,
-        mapper: Optional[MultiModalInputMapper[D]] = None,
+        mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
         Register an input mapper to a model class.
@@ -122,11 +91,13 @@ def wrapper(model_cls: N) -> N:
         return wrapper
 
     def map_input(self, model_config: ModelConfig,
-                  data: D) -> Dict[str, "torch.Tensor"]:
+                  data: object) -> Dict[str, "torch.Tensor"]:
         """
-        Apply an input mapper to a :class:`~MultiModalData` instance passed
+        Apply an input mapper to a data passed
         to the model, transforming the data into a dictionary of model inputs.
 
+        If the data is not something that the mapper expects, throws TypeError.
+
         The model is identified by ``model_config``.
 
         TODO: Add guide [ref: PR #5276]
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index d8d8ee4170eb..a0b4206bf2ee 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -1,5 +1,5 @@
 from functools import lru_cache
-from typing import Dict, Tuple, Type
+from typing import Dict
 
 import torch
 from PIL import Image
@@ -9,35 +9,17 @@
 from vllm.logger import init_logger
 from vllm.transformers_utils.image_processor import get_image_processor
 
-from .base import MultiModalData, MultiModalPlugin
+from .base import MultiModalPlugin
 
 logger = init_logger(__name__)
 
 cached_get_image_processor = lru_cache(get_image_processor)
 
 
-class ImageData(MultiModalData):
-    """
-    Contains a :class:`PIL.Image.Image` object. Requires that a HuggingFace
-    processor is available to the model.
-    """
+class ImagePlugin(MultiModalPlugin):
 
-    def __init__(self, image: Image.Image) -> None:
-        # So that this class can be created inside the Image context manager
-        image.load()
-        self.image = image
-
-    def __repr__(self) -> str:
-        return f"{type(self).__name__}(image={self.image})"
-
-
-class ImagePlugin(MultiModalPlugin[ImageData]):
-
-    def get_internal_data_type(self) -> Type[ImageData]:
-        return ImageData
-
-    def get_external_data_type(self) -> Tuple[str, Type[Image.Image]]:
-        return ("image", Image.Image)
+    def get_data_key(self) -> str:
+        return "image"
 
     def _get_hf_image_processor(self, model_config: ModelConfig):
         return cached_get_image_processor(
@@ -45,19 +27,18 @@ def _get_hf_image_processor(self, model_config: ModelConfig):
             trust_remote_code=model_config.trust_remote_code)
 
     def _default_input_mapper(self, ctx: InputContext,
-                              data: ImageData) -> Dict[str, torch.Tensor]:
+                              data: object) -> Dict[str, torch.Tensor]:
         model_config = ctx.model_config
-        image = data.image
-        if isinstance(image, Image.Image):
+        if isinstance(data, Image.Image):
             image_processor = self._get_hf_image_processor(model_config)
             if image_processor is None:
                 raise RuntimeError("No HuggingFace processor is available"
                                    "to process the image object")
             try:
-                return image_processor.preprocess(image, return_tensors="pt") \
+                return image_processor.preprocess(data, return_tensors="pt") \
                     .to(model_config.dtype).data
             except Exception:
-                logger.error("Failed to process image (%s)", image)
+                logger.error("Failed to process image (%s)", data)
                 raise
 
-        raise TypeError(f"Invalid image type: {type(image)}")
+        raise TypeError(f"Invalid type for 'image': {type(data)}")
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index a0bd960705e3..d5a78d851917 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,18 +1,17 @@
 import functools
-from typing import Any, Optional, Sequence, Type, TypeVar, Union
+from typing import Optional, Sequence, Type, TypeVar
 
 from torch import nn
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import (ExternalMultiModalDataDict, MultiModalData,
-                   MultiModalInputMapper, MultiModalPlugin)
-from .image import ImageData, ImagePlugin
+from .base import (ExternalMultiModalDataDict, MultiModalInputMapper,
+                   MultiModalPlugin)
+from .image import ImagePlugin
 
 logger = init_logger(__name__)
 
-D = TypeVar("D", bound=MultiModalData)
 N = TypeVar("N", bound=Type[nn.Module])
 
 
@@ -26,109 +25,86 @@ class MultiModalRegistry:
 
     DEFAULT_PLUGINS = (ImagePlugin(), )
 
-    def __init__(self,
-                 *,
-                 plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS
-                 ) -> None:
-        self._plugins_by_internal_data_type = {
-            p.get_internal_data_type(): p
-            for p in plugins
-        }
-        self._plugins_by_external_data_type = {
-            p.get_external_data_type(): p
-            for p in plugins
-        }
-
-    def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
-        data_type = plugin.get_internal_data_type()
-
-        if data_type in self._plugins_by_internal_data_type:
+    def __init__(
+            self,
+            *,
+            plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None:
+        self._plugins = {p.get_data_key(): p for p in plugins}
+
+    def register_plugin(self, plugin: MultiModalPlugin) -> None:
+        data_type_key = plugin.get_data_key()
+
+        if data_type_key in self._plugins:
             logger.warning(
                 "A plugin is already registered for data type %s, "
-                "and will be overwritten by the new plugin %s.", data_type,
+                "and will be overwritten by the new plugin %s.", data_type_key,
                 plugin)
 
-        self._plugins_by_internal_data_type[data_type] = plugin
+        self._plugins[data_type_key] = plugin
+
+    def _get_plugin(self, data_type_key: str):
+        plugin = self._plugins.get(data_type_key)
+        if plugin is not None:
+            return plugin
+
+        msg = f"Unknown multi-modal data type: {data_type_key}"
+        raise NotImplementedError(msg)
 
     def register_image_input_mapper(
         self,
-        mapper: Optional[MultiModalInputMapper[ImageData]] = None,
+        mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
         Register an input mapper for image pixel data to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input_mapper(ImageData, mapper)
+        return self.register_input_mapper("image", mapper)
 
     def _process_external_input(self, key, value, model_config: ModelConfig):
-        plugin = self._get_plugin_for_external_data_type(key, type(value))
+        plugin = self._plugins.get(key)
         if plugin:
-            return plugin.map_input(model_config,
-                                    plugin.get_internal_data_type()(value))
-        msg = f"Unknown multi-modal data type: {type(value)}"
-        raise NotImplementedError(msg)
-
-    def _get_plugin_for_external_data_type(self, key: str,
-                                           data_type: Type[Any]):
-        for typ in data_type.mro():
-            plugin = self._plugins_by_external_data_type.get((key, typ))
-            if plugin is not None:
-                return plugin
-
-        msg = f"No plugin found for key {key} and type {data_type}"
-        raise NotImplementedError(msg)
-
-    def _get_plugin_for_internal_data_type(self,
-                                           data_type: Type[MultiModalData]):
-        for typ in data_type.mro():
-            plugin = self._plugins_by_internal_data_type.get(typ)
-            if plugin is not None:
-                return plugin
-
-        msg = f"Unknown multi-modal data type: {data_type}"
+            return plugin.map_input(model_config, value)
+        msg = f"Unknown multi-modal data type: {key}"
         raise NotImplementedError(msg)
 
     def register_input_mapper(
         self,
-        data_type: Type[D],
-        mapper: Optional[MultiModalInputMapper[D]] = None,
+        data_type: str,
+        mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
         Register an input mapper for a specific modality to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self._get_plugin_for_internal_data_type(data_type) \
-            .register_input_mapper(mapper)
+        plugin = self._plugins.get(data_type)
+        if not plugin:
+            msg = f"Unknown multi-modal data type: {data_type}"
+            raise NotImplementedError(msg)
+        return plugin.register_input_mapper(mapper)
 
     def register_image_input(self,
-                             mapper: Optional[
-                                 MultiModalInputMapper[ImageData]] = None):
+                             mapper: Optional[MultiModalInputMapper] = None):
         """
         Register an input mapper for image pixel data to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """
-        return self.register_input_mapper(ImageData, mapper)
+        return self.register_input_mapper("image", mapper)
 
     def map_input(self, model_config: ModelConfig,
-                  data: Union[MultiModalData, ExternalMultiModalDataDict]):
+                  data: ExternalMultiModalDataDict):
         """
-        Apply an input mapper to a :class:`~MultiModalData` instance passed
-        to the model.
+        Apply an input mapper to the data passed to the model.
         
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
-        if isinstance(data, MultiModalData):
-            return self._get_plugin_for_internal_data_type(type(data)) \
-                .map_input(model_config, data)
-        else:
-            result_list = [
-                self._process_external_input(k, v, model_config)
-                for k, v in data.items()
-            ]
-            return {k: v for d in result_list for k, v in d.items()}
+        result_list = [
+            self._process_external_input(k, v, model_config)
+            for k, v in data.items()
+        ]
+        return {k: v for d in result_list for k, v in d.items()}
 
     def create_input_mapper(self, model_config: ModelConfig):
         """
diff --git a/vllm/sequence.py b/vllm/sequence.py
index c26f778674e5..69d68a6312fe 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -14,7 +14,7 @@
 
 if TYPE_CHECKING:
     from vllm.inputs import LLMInputs
-    from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData
+    from vllm.multimodal import ExternalMultiModalDataDict
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 
@@ -432,7 +432,7 @@ def prompt_token_ids(self) -> List[int]:
         return next(iter(self.seqs_dict.values())).prompt_token_ids
 
     @property
-    def multi_modal_data(self) -> Optional["MultiModalData"]:
+    def multi_modal_data(self) -> Optional["ExternalMultiModalDataDict"]:
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
         return next(iter(self.seqs_dict.values())).multi_modal_data

From 31b8b09f66ca8412af1bf5d6a8cd14607aebc817 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Mon, 1 Jul 2024 08:28:44 -0700
Subject: [PATCH 143/181] rm external

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 docs/source/dev/multimodal/multimodal_index.rst |  2 +-
 docs/source/models/vlm.rst                      |  2 +-
 tests/conftest.py                               |  6 +++---
 tests/models/test_llava.py                      |  2 +-
 tests/models/test_llava_next.py                 |  2 +-
 tests/models/test_phi3v.py                      |  2 +-
 tests/spec_decode/e2e/conftest.py               |  4 ++--
 vllm/entrypoints/openai/serving_chat.py         | 10 +++++-----
 vllm/inputs/data.py                             | 10 +++++-----
 vllm/inputs/registry.py                         |  6 +++---
 vllm/multimodal/__init__.py                     |  4 ++--
 vllm/multimodal/base.py                         |  5 ++---
 vllm/multimodal/registry.py                     | 11 ++++-------
 vllm/sequence.py                                |  8 ++++----
 14 files changed, 35 insertions(+), 39 deletions(-)

diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index d551eb899acc..8aa6bd48d7b2 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -29,7 +29,7 @@ Registry
 Base Classes
 ------------
 
-.. autoclass:: vllm.multimodal.ExternalMultiModalDataDict
+.. autoclass:: vllm.multimodal.MultiModalDataDict
     :members:
     :show-inheritance:
 
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index d4bb86ecf576..fe11af0b90e8 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.ExternalMultiModalDataDict`.
+* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.MultiModalDataDict`.
 
 .. code-block:: python
 
diff --git a/tests/conftest.py b/tests/conftest.py
index dca87149ce32..cc318ee46994 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -22,7 +22,7 @@
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal import ExternalMultiModalDataDict
+from vllm.multimodal import MultiModalDataDict
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
@@ -428,7 +428,7 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[List[ExternalMultiModalDataDict]] = None,
+        images: Optional[List[MultiModalDataDict]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -477,7 +477,7 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[List[ExternalMultiModalDataDict]] = None,
+        images: Optional[List[MultiModalDataDict]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 7f4a3a597874..c6313c52e4e3 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -75,7 +75,7 @@ def run_test(
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide ExternalMultiModalDataDict objects 
+    For vllm runner, we provide MultiModalDataDict objects 
     and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index b03e00923ca2..e9babba13c47 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -82,7 +82,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide ExternalMultiModalDataDict objects 
+    For vllm runner, we provide MultiModalDataDict objects 
     and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 5809e6f83755..917bdbf94ab9 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -84,7 +84,7 @@ def run_test(
 
     All the image fixtures for the test is under tests/images.
     For huggingface runner, we provide the PIL images as input.
-    For vllm runner, we provide ExternalMultiModalDataDict objects 
+    For vllm runner, we provide MultiModalDataDict objects 
     and corresponding vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index f75caef6a5b5..8ad8e9cb81ff 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -11,7 +11,7 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
-from vllm.multimodal import ExternalMultiModalDataDict
+from vllm.multimodal import MultiModalDataDict
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import Logprob
@@ -91,7 +91,7 @@ def generate(
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[ExternalMultiModalDataDict] = None,
+        multi_modal_data: Optional[MultiModalDataDict] = None,
     ) -> List[RequestOutput]:
 
         if prompts is None:
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 55d5c383a6ee..033dcaf0724c 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -26,7 +26,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
-from vllm.multimodal import ExternalMultiModalDataDict
+from vllm.multimodal import MultiModalDataDict
 from vllm.multimodal.utils import ImageFetchAiohttp, get_full_image_text_prompt
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
@@ -46,7 +46,7 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
     messages: List[ConversationMessage]
-    mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field(
+    mm_futures: List[Awaitable[MultiModalDataDict]] = field(
         default_factory=list)
 
 
@@ -102,7 +102,7 @@ def _parse_chat_message_content_parts(
         parts: Iterable[ChatCompletionContentPartParam],
     ) -> ChatMessageParseResult:
         texts: List[str] = []
-        mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = []
+        mm_futures: List[Awaitable[MultiModalDataDict]] = []
 
         vlm_config: Optional[VisionLanguageConfig] = getattr(
             self.engine.engine, "vision_language_config", None)
@@ -205,7 +205,7 @@ async def create_chat_completion(
 
         try:
             conversation: List[ConversationMessage] = []
-            mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = []
+            mm_futures: List[Awaitable[MultiModalDataDict]] = []
 
             for msg in request.messages:
                 chat_parsed_result = self._parse_chat_message_content(msg)
@@ -222,7 +222,7 @@ async def create_chat_completion(
             logger.error("Error in applying chat template from request: %s", e)
             return self.create_error_response(str(e))
 
-        mm_data: Optional[ExternalMultiModalDataDict] = None
+        mm_data: Optional[MultiModalDataDict] = None
         try:
             if len(mm_futures):
                 # since we support only single mm data currently
diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py
index df8c38ead21a..c6381fcc01e5 100644
--- a/vllm/inputs/data.py
+++ b/vllm/inputs/data.py
@@ -4,7 +4,7 @@
 from typing_extensions import NotRequired
 
 if TYPE_CHECKING:
-    from vllm.multimodal import ExternalMultiModalDataDict
+    from vllm.multimodal import MultiModalDataDict
 
 
 class ParsedText(TypedDict):
@@ -72,7 +72,7 @@ class TextPrompt(TypedDict):
     prompt: str
     """The input text to be tokenized before passing to the model."""
 
-    multi_modal_data: NotRequired["ExternalMultiModalDataDict"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -85,7 +85,7 @@ class TokensPrompt(TypedDict):
     prompt_token_ids: List[int]
     """A list of token IDs to pass to the model."""
 
-    multi_modal_data: NotRequired["ExternalMultiModalDataDict"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -103,7 +103,7 @@ class TextTokensPrompt(TypedDict):
     prompt_token_ids: List[int]
     """The token IDs of the prompt."""
 
-    multi_modal_data: NotRequired["ExternalMultiModalDataDict"]
+    multi_modal_data: NotRequired["MultiModalDataDict"]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
@@ -136,7 +136,7 @@ class LLMInputs(TypedDict):
     The original prompt text corresponding to the token IDs, if available.
     """
 
-    multi_modal_data: NotRequired[Optional["ExternalMultiModalDataDict"]]
+    multi_modal_data: NotRequired[Optional["MultiModalDataDict"]]
     """
     Optional multi-modal data to pass to the model,
     if the model supports it.
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 85552db4aefe..3e28733383cb 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -12,7 +12,7 @@
 
 if TYPE_CHECKING:
     from vllm.config import ModelConfig, VisionLanguageConfig
-    from vllm.multimodal import ExternalMultiModalDataDict
+    from vllm.multimodal import MultiModalDataDict
     from vllm.sequence import SequenceData
 
 logger = init_logger(__name__)
@@ -67,7 +67,7 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C:
 
 DummyDataFactory = Callable[[InputContext, int],
                             Tuple["SequenceData",
-                                  Optional["ExternalMultiModalDataDict"]]]
+                                  Optional["MultiModalDataDict"]]]
 """
 Create dummy data to be inputted into the model.
 
@@ -95,7 +95,7 @@ def _default_dummy_data_factory(
         self,
         ctx: InputContext,
         seq_len: int,
-    ) -> Tuple["SequenceData", Optional["ExternalMultiModalDataDict"]]:
+    ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]:
         """
         The default dummy data factory represents the longest possible text
         that can be inputted to the model.
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
index ebbad488ecdc..256eadd2d7df 100644
--- a/vllm/multimodal/__init__.py
+++ b/vllm/multimodal/__init__.py
@@ -1,4 +1,4 @@
-from .base import ExternalMultiModalDataDict, MultiModalPlugin
+from .base import MultiModalDataDict, MultiModalPlugin
 from .registry import MultiModalRegistry
 
 MULTIMODAL_REGISTRY = MultiModalRegistry()
@@ -14,5 +14,5 @@
     "MultiModalPlugin",
     "MULTIMODAL_REGISTRY",
     "MultiModalRegistry",
-    "ExternalMultiModalDataDict",
+    "MultiModalDataDict",
 ]
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 8222e94c3c85..04d02c70a381 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -16,12 +16,11 @@
 N = TypeVar("N", bound=Type["nn.Module"])
 
 
-class ExternalMultiModalDataBuiltins(TypedDict, total=False):
+class MultiModalDataBuiltins(TypedDict, total=False):
     image: Union["Image.Image", "torch.Tensor"]
 
 
-ExternalMultiModalDataDict = Union[ExternalMultiModalDataBuiltins, Dict[str,
-                                                                        Any]]
+MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
 
 MultiModalInputMapper = Callable[[InputContext, object], Dict[str,
                                                               "torch.Tensor"]]
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index d5a78d851917..a30a5b9c3afc 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -6,8 +6,7 @@
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import (ExternalMultiModalDataDict, MultiModalInputMapper,
-                   MultiModalPlugin)
+from .base import MultiModalDataDict, MultiModalInputMapper, MultiModalPlugin
 from .image import ImagePlugin
 
 logger = init_logger(__name__)
@@ -61,7 +60,7 @@ def register_image_input_mapper(
         """
         return self.register_input_mapper("image", mapper)
 
-    def _process_external_input(self, key, value, model_config: ModelConfig):
+    def _process_input(self, key, value, model_config: ModelConfig):
         plugin = self._plugins.get(key)
         if plugin:
             return plugin.map_input(model_config, value)
@@ -93,16 +92,14 @@ def register_image_input(self,
         """
         return self.register_input_mapper("image", mapper)
 
-    def map_input(self, model_config: ModelConfig,
-                  data: ExternalMultiModalDataDict):
+    def map_input(self, model_config: ModelConfig, data: MultiModalDataDict):
         """
         Apply an input mapper to the data passed to the model.
         
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
         result_list = [
-            self._process_external_input(k, v, model_config)
-            for k, v in data.items()
+            self._process_input(k, v, model_config) for k, v in data.items()
         ]
         return {k: v for d in result_list for k, v in d.items()}
 
diff --git a/vllm/sequence.py b/vllm/sequence.py
index 69d68a6312fe..33b831b48003 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -14,7 +14,7 @@
 
 if TYPE_CHECKING:
     from vllm.inputs import LLMInputs
-    from vllm.multimodal import ExternalMultiModalDataDict
+    from vllm.multimodal import MultiModalDataDict
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 
@@ -255,7 +255,7 @@ def prompt_token_ids(self) -> List[int]:
         return self.inputs["prompt_token_ids"]
 
     @property
-    def multi_modal_data(self) -> "ExternalMultiModalDataDict":
+    def multi_modal_data(self) -> "MultiModalDataDict":
         return self.inputs.get("multi_modal_data") or {}
 
     @property
@@ -432,7 +432,7 @@ def prompt_token_ids(self) -> List[int]:
         return next(iter(self.seqs_dict.values())).prompt_token_ids
 
     @property
-    def multi_modal_data(self) -> Optional["ExternalMultiModalDataDict"]:
+    def multi_modal_data(self) -> Optional["MultiModalDataDict"]:
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
         return next(iter(self.seqs_dict.values())).multi_modal_data
@@ -614,7 +614,7 @@ def __init__(
         lora_request: Optional[LoRARequest] = None,
         computed_block_nums: Optional[List[int]] = None,
         state: Optional[SequenceGroupState] = None,
-        multi_modal_data: Optional["ExternalMultiModalDataDict"] = None,
+        multi_modal_data: Optional["MultiModalDataDict"] = None,
         encoder_seq_data: Optional[SequenceData] = None,
         cross_block_table: Optional[List[int]] = None,
     ) -> None:

From a4b5617a3084b37ab5d3e55fb10312c06b3a6c01 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Mon, 1 Jul 2024 08:56:54 -0700
Subject: [PATCH 144/181] comments

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 vllm/engine/arg_utils.py    | 2 +-
 vllm/multimodal/base.py     | 2 +-
 vllm/multimodal/registry.py | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index b8200ffad391..c9a31c975bea 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -614,7 +614,7 @@ def create_engine_config(self, ) -> EngineConfig:
             raise ValueError(
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
-        if self.image_token_id:
+        if self.image_token_id is not None:
             if (not self.image_input_shape or not self.image_feature_size):
                 raise ValueError(
                     'Specify `image_input_shape` and '
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 04d02c70a381..65dcd686ed08 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -17,7 +17,7 @@
 
 
 class MultiModalDataBuiltins(TypedDict, total=False):
-    image: Union["Image.Image", "torch.Tensor"]
+    image: "Image.Image"
 
 
 MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index a30a5b9c3afc..9a297baa7011 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -60,7 +60,8 @@ def register_image_input_mapper(
         """
         return self.register_input_mapper("image", mapper)
 
-    def _process_input(self, key, value, model_config: ModelConfig):
+    def _process_input(self, key: str, value: object,
+                       model_config: ModelConfig):
         plugin = self._plugins.get(key)
         if plugin:
             return plugin.map_input(model_config, value)

From 045674d132891684c6de3acccf456c0e0bc2a036 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Mon, 1 Jul 2024 11:08:04 -0700
Subject: [PATCH 145/181] fix dist gpu test.

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 tests/conftest.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index cc318ee46994..1aa054f05a62 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,8 +5,8 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple,
+                    TypedDict, TypeVar)
 
 import pytest
 import torch
@@ -22,10 +22,13 @@
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalDataDict
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
+if TYPE_CHECKING:
+    # it will call torch.cuda.device_count()
+    from vllm.multimodal import MultiModalDataDict
+
 logger = init_logger(__name__)
 
 _TEST_DIR = os.path.dirname(__file__)
@@ -428,7 +431,7 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[List[MultiModalDataDict]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -477,7 +480,7 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[List[MultiModalDataDict]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)

From c8fa1505a894801f70fb42b4ec1c52edc7dd43f9 Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Mon, 1 Jul 2024 17:41:04 -0700
Subject: [PATCH 146/181] address comments

---
 vllm/entrypoints/openai/serving_chat.py    | 11 +++++------
 vllm/model_executor/model_loader/loader.py |  5 ++---
 vllm/model_executor/models/llava.py        |  1 +
 vllm/multimodal/utils.py                   |  6 ++++++
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 033dcaf0724c..d978199b040f 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -27,7 +27,8 @@
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
 from vllm.multimodal import MultiModalDataDict
-from vllm.multimodal.utils import ImageFetchAiohttp, get_full_image_text_prompt
+from vllm.multimodal.utils import (async_get_and_parse_image,
+                                   get_full_image_text_prompt)
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
 from vllm.tracing import (contains_trace_headers, extract_trace_headers,
@@ -127,10 +128,6 @@ def _parse_chat_message_content_parts(
                         "'image_url.detail' is currently not supported and "
                         "will be ignored.")
 
-                async def async_get_and_parse_image(image_url: str):
-                    image = await ImageFetchAiohttp.fetch_image(image_url)
-                    return {"image": image}
-
                 mm_future = async_get_and_parse_image(image_url["url"])
                 mm_futures.append(mm_future)
 
@@ -141,7 +138,9 @@ async def async_get_and_parse_image(image_url: str):
 
         if vlm_config is not None and len(mm_futures):
 
-            assert len(mm_futures) == 1, "Multiple images is not supported."
+            assert len(
+                mm_futures
+            ) == 1, "Multiple 'image_url' input is currently not supported."
             (image_token_prompt,
              image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
 
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 02f12ecffa74..2322e9d39d03 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -84,9 +84,8 @@ def _get_model_initialization_kwargs(
 
     if supports_vision(model_class):
         if vlm_config is None:
-            raise ValueError("Provide vision "
-                             "related configurations through LLM entrypoint "
-                             "or engine arguments.")
+            raise ValueError("Provide vision related configurations "
+                             "through LLM entrypoint or engine arguments.")
 
         extra_kwargs["vlm_config"] = vlm_config
 
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 5e48c5530a0c..e0134c5c452d 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -110,6 +110,7 @@ def __init__(self,
         self.config = config
         self.vlm_config = vlm_config
 
+        # TODO: Optionally initializes this for supporting embeddings.
         self.vision_tower = CLIPVisionModel(config.vision_config)
 
         self.multi_modal_projector = LlavaMultiModalProjector(
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index aad9822db78d..19e0ab63ad1f 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -8,6 +8,7 @@
 
 from vllm.config import ModelConfig
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
+from vllm.vllm.multimodal.base import MultiModalDataDict
 
 
 class ImageFetchAiohttp:
@@ -86,3 +87,8 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
         raise ValueError(
             f"Unsupported model type: {config.hf_config.model_type}")
     return full_prompt
+
+
+async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict:
+    image = await ImageFetchAiohttp.fetch_image(image_url)
+    return {"image": image}

From 58ab8e9f63930a23af50f381eb3a63faf9214eb8 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 01:04:38 +0000
Subject: [PATCH 147/181] Further avoid cuda init

---
 vllm/inputs/registry.py     | 9 +++++----
 vllm/multimodal/registry.py | 9 +++++----
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index ea6d72ce0f16..8fafb05db15f 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -3,7 +3,6 @@
 from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
                     TypeVar)
 
-from torch import nn
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
@@ -11,6 +10,8 @@
 from .data import LLMInputs
 
 if TYPE_CHECKING:
+    from torch import nn
+
     from vllm.config import ModelConfig, VisionLanguageConfig
     from vllm.multimodal import MultiModalData
     from vllm.sequence import SequenceData
@@ -63,7 +64,7 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C:
         return hf_config
 
 
-N = TypeVar("N", bound=Type[nn.Module])
+N = TypeVar("N", bound=Type["nn.Module"])
 
 DummyDataFactory = Callable[[InputContext, int],
                             Tuple["SequenceData", Optional["MultiModalData"]]]
@@ -85,9 +86,9 @@ class InputRegistry:
     """
 
     def __init__(self) -> None:
-        self._dummy_factories_by_model_type: Dict[Type[nn.Module],
+        self._dummy_factories_by_model_type: Dict[Type["nn.Module"],
                                                   DummyDataFactory] = {}
-        self._input_processors_by_model_type: Dict[Type[nn.Module],
+        self._input_processors_by_model_type: Dict[Type["nn.Module"],
                                                    InputProcessor] = {}
 
     def _default_dummy_data_factory(
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index abc88e4f9a9d..9b83daaa5477 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,7 +1,5 @@
 import functools
-from typing import Any, Optional, Sequence, Type, TypeVar
-
-from torch import nn
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Type, TypeVar
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
@@ -10,10 +8,13 @@
 from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
                     ImagePixelPlugin)
 
+if TYPE_CHECKING:
+    from torch import nn
+
 logger = init_logger(__name__)
 
 D = TypeVar("D", bound=MultiModalData)
-N = TypeVar("N", bound=Type[nn.Module])
+N = TypeVar("N", bound=Type["nn.Module"])
 
 
 class MultiModalRegistry:

From 6975caa3c56248b67040aa0d1ae6d3ba1478cb85 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 01:22:26 +0000
Subject: [PATCH 148/181] Add warnings for repeated image tokens

---
 vllm/model_executor/models/phi3v.py | 21 ++++++++++++++++++++-
 vllm/multimodal/image.py            | 11 +++++++++++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 17f5802cd0a9..e5733bbf6c36 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -13,6 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import re
 from functools import lru_cache
 from typing import Iterable, List, Literal, Optional, Tuple, TypedDict
 
@@ -24,6 +25,7 @@
 from vllm.attention import AttentionMetadata
 from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
 from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs
+from vllm.logger import init_logger
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
@@ -42,6 +44,8 @@
                    input_processor_for_clip)
 from .interfaces import SupportsVision
 
+logger = init_logger(__name__)
+
 _KEYS_TO_MODIFY_MAPPING = {
     "model.vision_embed_tokens": "vision_embed_tokens",
 }
@@ -386,6 +390,21 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
         image_features = multi_modal_data.image_features
         image_feature_size = image_features.shape[-2]
 
+    prompt = llm_inputs.get("prompt")
+    if prompt is None:
+        new_prompt = None
+    else:
+        if prompt.count("<|image|>") > 0:
+            logger.warning("Please follow the prompt format that is "
+                           "recommended on HuggingFace which does not involve "
+                           "repeating <|image|> tokens.")
+        elif len(re.findall(r"(<\|image_\d+\|>)+", prompt)) > 1:
+            logger.warning("Multiple image input is not supported yet, "
+                           "so any extra image tokens will be treated "
+                           "as plain text.")
+
+        new_prompt = prompt
+
     prompt_token_ids = llm_inputs["prompt_token_ids"]
     image_1_token_ids = _get_image_placeholder_token_ids(model_config, idx=1)
 
@@ -402,7 +421,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
 
     # NOTE: Create a defensive copy of the original inputs
     llm_inputs = LLMInputs(prompt_token_ids=new_token_ids,
-                           prompt=llm_inputs.get("prompt"),
+                           prompt=new_prompt,
                            multi_modal_data=multi_modal_data)
 
     return input_processor_for_clip(
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 33b07f33714d..92a4aa2a96f1 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -64,6 +64,17 @@ def repeat_and_pad_image_tokens(
                 pad_token_right=pad_token_str_right,
             ))
 
+        image_token_count = prompt.count(image_token_str)
+        if image_token_count > 16:
+            logger.warning(
+                "Please follow the prompt format that is "
+                "recommended on HuggingFace which does not involve "
+                "repeating %s tokens.", image_token_str)
+        elif image_token_count > 1:
+            logger.warning("Multiple image input is not supported yet, "
+                           "so any extra image tokens will be treated "
+                           "as plain text.")
+
         # The image tokens are removed to be consistent with HuggingFace
         new_prompt = prompt.replace(image_token_str, replacement_str, 1)
 

From b1f1813aac52134bda64df968798a07084e6c31d Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Mon, 1 Jul 2024 18:49:45 -0700
Subject: [PATCH 149/181] docs

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
---
 docs/requirements-docs.txt                      | 16 ++++------------
 docs/source/dev/multimodal/multimodal_index.rst |  6 ++++--
 docs/source/models/vlm.rst                      |  7 ++++++-
 vllm/multimodal/registry.py                     |  2 +-
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt
index ed569816200e..db076b2d801d 100644
--- a/docs/requirements-docs.txt
+++ b/docs/requirements-docs.txt
@@ -1,13 +1,5 @@
-sphinx == 6.2.1
-sphinx-book-theme == 1.0.1
-sphinx-copybutton == 0.5.2
-myst-parser == 2.0.0
+sphinx==6.2.1
+sphinx-book-theme==1.0.1
+sphinx-copybutton==0.5.2
+myst-parser==2.0.0
 sphinx-argparse
-
-# packages to install to build the documentation
-pydantic
--f https://download.pytorch.org/whl/cpu
-torch
-py-cpuinfo
-transformers
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
index 8aa6bd48d7b2..4d5fb3246b68 100644
--- a/docs/source/dev/multimodal/multimodal_index.rst
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -9,8 +9,10 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 which allows you to pass in multi-modal input alongside text and token prompts.
 
 By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
-you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
-as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
+you must decorate the model class with :meth:`InputRegistry.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`,
+as well as :meth:`MULTIMODAL_REGISTRY.register_input_mapper <MultiModalRegistry.register_input_mapper>` for each modality type to support.
+
+# TODO: Add more instructions on how to do that once embeddings is in.
 
 Module Contents
 +++++++++++++++
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index fe11af0b90e8..053f5b8609ce 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -48,7 +48,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.MultiModalDataDict`.
+* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. 
+
+.. note::
+
+   ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
+    :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
 
 .. code-block:: python
 
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 9a297baa7011..a09a80f89f4b 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -54,7 +54,7 @@ def register_image_input_mapper(
         mapper: Optional[MultiModalInputMapper] = None,
     ):
         """
-        Register an input mapper for image pixel data to a model class.
+        Register an input mapper for image data to a model class.
 
         See :meth:`MultiModalPlugin.register_input_mapper` for more details.
         """

From b8b636d83f67584c31b898bd4cb6b0cd3a45872b Mon Sep 17 00:00:00 2001
From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com>
Date: Mon, 1 Jul 2024 19:11:12 -0700
Subject: [PATCH 150/181] Update vllm/multimodal/base.py

Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
---
 vllm/multimodal/base.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 65dcd686ed08..e41a814b07af 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -21,6 +21,11 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 
 
 MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]]
+"""
+A dictionary containing an item for each modality type to input.
+
+The data belonging to each modality is converted into keyword arguments to the model by the corresponding mapper. By default, the mapper of the corresponding plugin with the same modality key is applied.
+"""
 
 MultiModalInputMapper = Callable[[InputContext, object], Dict[str,
                                                               "torch.Tensor"]]

From 2c1d2912fa9ea5a6af42a11192286f9bc2a4b63f Mon Sep 17 00:00:00 2001
From: Xiaowei Jiang <xwjiang2010@gmail.com>
Date: Mon, 1 Jul 2024 19:20:45 -0700
Subject: [PATCH 151/181] format Signed-off-by: Xiaowei Jiang
 <xwjiang2010@gmail.com>

---
 vllm/multimodal/base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index e41a814b07af..558cd1175298 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -24,7 +24,9 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 """
 A dictionary containing an item for each modality type to input.
 
-The data belonging to each modality is converted into keyword arguments to the model by the corresponding mapper. By default, the mapper of the corresponding plugin with the same modality key is applied.
+The data belonging to each modality is converted into keyword arguments 
+to the model by the corresponding mapper. By default, the mapper of 
+the corresponding plugin with the same modality key is applied.
 """
 
 MultiModalInputMapper = Callable[[InputContext, object], Dict[str,

From b6401d33e49b35675db9ef5f53c5faa860f1b579 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 02:55:32 +0000
Subject: [PATCH 152/181] Reword

---
 vllm/model_executor/models/phi3v.py | 2 +-
 vllm/multimodal/image.py            | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index e5733bbf6c36..88ee711ea1b4 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -396,7 +396,7 @@ def input_processor_for_phi3v(ctx: InputContext, llm_inputs: LLMInputs):
     else:
         if prompt.count("<|image|>") > 0:
             logger.warning("Please follow the prompt format that is "
-                           "recommended on HuggingFace which does not involve "
+                           "documented on HuggingFace which does not involve "
                            "repeating <|image|> tokens.")
         elif len(re.findall(r"(<\|image_\d+\|>)+", prompt)) > 1:
             logger.warning("Multiple image input is not supported yet, "
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 92a4aa2a96f1..5781d6197fd5 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -68,7 +68,7 @@ def repeat_and_pad_image_tokens(
         if image_token_count > 16:
             logger.warning(
                 "Please follow the prompt format that is "
-                "recommended on HuggingFace which does not involve "
+                "documented on HuggingFace which does not involve "
                 "repeating %s tokens.", image_token_str)
         elif image_token_count > 1:
             logger.warning("Multiple image input is not supported yet, "

From 89f1103b8949743d694273b5882e91b4120f840e Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 03:47:57 +0000
Subject: [PATCH 153/181] Remove useless test

---
 tests/multimodal/test_mapper.py | 33 ---------------------------------
 1 file changed, 33 deletions(-)

diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 19776ea0e594..7009ccc6ed2c 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -83,36 +83,3 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
 
             assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
             assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
-
-
-@pytest.mark.xfail(
-    reason="Example image pixels were not processed using HuggingFace")
-@pytest.mark.parametrize("dtype", ["float"])
-def test_image_pixel_types(image_assets, dtype):
-    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-
-    model_config = ModelConfig(
-        model=MODEL_NAME,
-        tokenizer=MODEL_NAME,
-        tokenizer_mode="auto",
-        trust_remote_code=False,
-        seed=0,
-        dtype=dtype,
-        revision=None,
-    )
-    for asset in image_assets:
-        image_result = MULTIMODAL_REGISTRY.map_input(
-            model_config,
-            {"image": asset.pil_image},
-        )
-        tensor_result = MULTIMODAL_REGISTRY.map_input(
-            model_config,
-            {"image": asset.pil_image},
-        )
-
-        assert image_result.keys() == tensor_result.keys()
-        for key, image_arr in image_result.items():
-            tensor_arr: np.ndarray = tensor_result[key].numpy()
-
-            assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
-            assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"

From 47fbdba4dc449d97f43b8dbe4455451b5058ea20 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 03:57:23 +0000
Subject: [PATCH 154/181] Unify test API between HfRunner and VllmRunner

---
 tests/conftest.py               | 27 ++++++-----------
 tests/models/test_llava.py      | 36 +++++++----------------
 tests/models/test_llava_next.py | 32 +++++++--------------
 tests/models/test_phi3v.py      | 51 ++++++---------------------------
 4 files changed, 36 insertions(+), 110 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 376255541e0b..61628a125bc5 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,9 +6,8 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional
-from typing import Sequence as GenericSequence
-from typing import Tuple, TypedDict, TypeVar
+from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
+                    TypeVar)
 
 import pytest
 import torch
@@ -27,10 +26,6 @@
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
-if TYPE_CHECKING:
-    # it will call torch.cuda.device_count()
-    from vllm.multimodal import MultiModalDataDict
-
 logger = init_logger(__name__)
 
 _TEST_DIR = os.path.dirname(__file__)
@@ -55,12 +50,6 @@ class ImageAsset:
     def pil_image(self) -> Image.Image:
         return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
 
-    def for_hf(self) -> Image.Image:
-        return self.pil_image
-
-    def for_vllm(self) -> Dict[str, Any]:
-        return {"image": self.pil_image}
-
 
 class _ImageAssetPrompts(TypedDict):
     stop_sign: str
@@ -466,7 +455,7 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[GenericSequence["MultiModalDataDict"]] = None,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
@@ -474,7 +463,7 @@ def generate(
         inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
         if images is not None:
             for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = image
+                inputs[i]["multi_modal_data"] = {"image": image}
 
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
@@ -497,7 +486,7 @@ def generate_w_logprobs(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[GenericSequence["MultiModalDataDict"]] = None,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         assert sampling_params.logprobs is not None
 
@@ -507,7 +496,7 @@ def generate_w_logprobs(
         inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
         if images is not None:
             for i, image in enumerate(images):
-                inputs[i]["multi_modal_data"] = image
+                inputs[i]["multi_modal_data"] = {"image": image}
 
         req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
@@ -524,7 +513,7 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[GenericSequence["MultiModalDataDict"]] = None,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
@@ -536,7 +525,7 @@ def generate_greedy_logprobs(
         prompts: List[str],
         max_tokens: int,
         num_logprobs: int,
-        images: Optional[GenericSequence["MultiModalDataDict"]] = None,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                 max_tokens=max_tokens,
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 4d050cec3c7d..9b953457f795 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -94,10 +94,15 @@ def run_test(
     """
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
-    from vllm.multimodal.image import ImagePixelData
     from vllm.multimodal.utils import rescale_image_size
 
     model_id, vlm_config = model_and_config
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
@@ -111,33 +116,12 @@ def run_test(
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
-        hf_images = [asset.for_hf() for asset in image_assets]
-        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
-        # we must put it inside the vllm_runner context manager
-        # i.e. after creating vLLM instance.
-        vllm_images = [asset.for_vllm() for asset in image_assets]
-
-        image_inputs_per_image = [[(
-            prompt,
-            rescale_image_size(hf_image, factor),
-            ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-        ) for factor in size_factors] for hf_image, vllm_image, prompt in zip(
-            hf_images, vllm_images, HF_IMAGE_PROMPTS)]
-        hf_inputs_per_image = [(
-            [prompt for prompt, hf_image, vllm_image in image_inputs],
-            [hf_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_image]
-        vllm_inputs_per_image = [(
-            [prompt for prompt, hf_image, vllm_image in image_inputs],
-            [vllm_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_image]
-
         vllm_outputs_per_image = [
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
-                                                images=vllm_images)
-            for prompts, vllm_images in vllm_inputs_per_image
+                                                images=images)
+            for prompts, images in inputs_per_image
         ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
@@ -145,8 +129,8 @@ def run_test(
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
-                                                    images=hf_images)
-            for prompts, hf_images in hf_inputs_per_image
+                                                    images=images)
+            for prompts, images in inputs_per_image
         ]
 
     for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 5046bd65be8d..073cba4a1327 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -112,27 +112,15 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     """
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
-    from vllm.multimodal.image import ImagePixelData
     from vllm.multimodal.utils import rescale_image_size
 
     model_id, vlm_config = model_and_config
-    hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm() for asset in image_assets]
-
-    image_inputs_per_image = [[(
-        prompt,
-        rescale_image_size(hf_image, factor),
-        ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-    ) for factor in size_factors] for hf_image, vllm_image, prompt in zip(
-        hf_images, vllm_images, HF_IMAGE_PROMPTS)]
-    hf_inputs_per_image = [(
-        [prompt for prompt, hf_image, vllm_image in image_inputs],
-        [hf_image for prompt, hf_image, vllm_image in image_inputs],
-    ) for image_inputs in image_inputs_per_image]
-    vllm_inputs_per_image = [(
-        [prompt for prompt, hf_image, vllm_image in image_inputs],
-        [vllm_image for prompt, hf_image, vllm_image in image_inputs],
-    ) for image_inputs in image_inputs_per_image]
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
@@ -144,8 +132,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             vllm_model.generate_greedy_logprobs(prompts,
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
-                                                images=vllm_images)
-            for prompts, vllm_images in vllm_inputs_per_image
+                                                images=images)
+            for prompts, images in inputs_per_image
         ]
 
     with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
@@ -153,8 +141,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             hf_model.generate_greedy_logprobs_limit(prompts,
                                                     max_tokens,
                                                     num_logprobs=num_logprobs,
-                                                    images=hf_images)
-            for prompts, hf_images in hf_inputs_per_image
+                                                    images=images)
+            for prompts, images in inputs_per_image
         ]
 
     for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 9c32652df1f6..5a6e235b4884 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -97,56 +97,21 @@ def run_test(
     """
     # don't put this import at the top level
     # it will call torch.cuda.device_count()
-    from vllm.multimodal.image import ImagePixelData
     from vllm.multimodal.utils import rescale_image_size
 
     model_id, vlm_config = model_and_config
+    images = [asset.pil_image for asset in image_assets]
+
+    inputs_per_image = [(
+        [prompt for _ in size_factors],
+        [rescale_image_size(image, factor) for factor in size_factors],
+    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
 
     # NOTE: take care of the order. run vLLM first, and then run HF.
     # vLLM needs a fresh new process without cuda initialization.
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).
 
-    with vllm_runner(model_id,
-                     max_model_len=2048,
-                     dtype=dtype,
-                     tensor_parallel_size=tensor_parallel_size,
-                     enforce_eager=True,
-                     distributed_executor_backend=distributed_executor_backend,
-                     **vlm_config.as_cli_args_dict()) as vllm_model:
-
-        hf_images = [asset.for_hf() for asset in image_assets]
-        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
-        # we must put it inside the vllm_runner context manager
-        # i.e. after creating vLLM instance.
-
-        vllm_images = [asset.for_vllm() for asset in image_assets]
-
-        vllm_image_prompts = [
-            p.replace("<|image_1|>",
-                      "<|image|>" * vlm_config.image_feature_size + "<s>")
-            for p in HF_IMAGE_PROMPTS
-        ]
-
-        image_inputs_per_image = [[(
-            prompt,
-            rescale_image_size(hf_image, factor),
-            ImagePixelData(image=rescale_image_size(vllm_image.image, factor)),
-        ) for factor in size_factors] for hf_image, vllm_image, prompt in zip(
-            hf_images, vllm_images, HF_IMAGE_PROMPTS)]
-        hf_inputs_per_image = [(
-            [prompt for prompt, hf_image, vllm_image in image_inputs],
-            [hf_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_image]
-        vllm_inputs_per_image = [(
-            [prompt for prompt, hf_image, vllm_image in image_inputs],
-            [vllm_image for prompt, hf_image, vllm_image in image_inputs],
-        ) for image_inputs in image_inputs_per_image]
-
-        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
-                                                  max_tokens,
-                                                  images=vllm_images)
-
     # max_model_len should be greater than image_feature_size
     with vllm_runner(model_id,
                      max_model_len=4096,
@@ -160,7 +125,7 @@ def run_test(
                                                 max_tokens,
                                                 num_logprobs=num_logprobs,
                                                 images=vllm_images)
-            for prompts, vllm_images in vllm_inputs_per_image
+            for prompts, vllm_images in inputs_per_image
         ]
 
     # use eager mode for hf runner, since phi3_v didn't work with flash_attn
@@ -174,7 +139,7 @@ def run_test(
                                                     num_logprobs=num_logprobs,
                                                     images=hf_images,
                                                     eos_token_id=eos_token_id)
-            for prompts, hf_images in hf_inputs_per_image
+            for prompts, hf_images in inputs_per_image
         ]
 
     for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,

From c1c5a4d8cd92ed8bc2ee6525ac3be58daa7618b3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 04:00:44 +0000
Subject: [PATCH 155/181] Fix import error

---
 vllm/multimodal/base.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 95da47e3ec88..c8daf9ea1df2 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -35,7 +35,7 @@ class MultiModalData:
     pass
 
 
-BatchedTensors = Union[torch.Tensor, List[torch.Tensor]]
+BatchedTensors = Union["torch.Tensor", List["torch.Tensor"]]
 """
 If each input tensor in the batch has the same size, this is a single batched
 tensor; otherwise, this is a list of tensors with one element per batch.
@@ -47,7 +47,7 @@ class _MultiModalInputsBase(UserDict):
         pass
 else:
 
-    class _MultiModalInputsBase(UserDict[str, torch.Tensor]):
+    class _MultiModalInputsBase(UserDict[str, "torch.Tensor"]):
         pass
 
 
@@ -59,10 +59,13 @@ class MultiModalInputs(_MultiModalInputsBase):
 
     @staticmethod
     def try_concat(
-        tensors: List[torch.Tensor],
+        tensors: List["torch.Tensor"],
         *,
         device: "torch.types.Device",
     ) -> BatchedTensors:
+        # Avoid initializing CUDA too early
+        import torch
+        
         unbatched_shape = tensors[0].shape[1:]
 
         for tensor in tensors:
@@ -84,7 +87,7 @@ def batch(
 
         keys = inputs_list[0].keys()
 
-        item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list)
+        item_lists: Dict[str, List["torch.Tensor"]] = defaultdict(list)
 
         for inputs in inputs_list:
             if inputs.keys() != keys:

From fde4b2539c471dc2664bbb3dc97f9b1ab785a11d Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 04:21:23 +0000
Subject: [PATCH 156/181] Fix attribute error

---
 vllm/model_executor/models/phi3v.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index f4d0df843457..f3c64d66f36b 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -444,12 +444,12 @@ def __init__(self,
         self.config = config
         self.vlm_config = vlm_config
 
+        self.model = LlamaModel(config, cache_config, quant_config)
+
         # TODO: Optionally initializes this for supporting embeddings.
         self.vision_embed_tokens = Phi3HDImageEmbedding(
             vlm_config, config, self.model.embed_tokens)
 
-        self.model = LlamaModel(config, cache_config, quant_config)
-
         self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size)
         self.logits_processor = LogitsProcessor(config.vocab_size)
         self.sampler = Sampler()

From 4278fed0342c78ffcbc49b59bac4d4e1215504a3 Mon Sep 17 00:00:00 2001
From: Roger Wang <ywang@roblox.com>
Date: Mon, 1 Jul 2024 22:20:23 -0700
Subject: [PATCH 157/181] fix import error

---
 vllm/multimodal/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 19e0ab63ad1f..321b51e5a883 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -8,7 +8,7 @@
 
 from vllm.config import ModelConfig
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
-from vllm.vllm.multimodal.base import MultiModalDataDict
+from vllm.multimodal.base import MultiModalDataDict
 
 
 class ImageFetchAiohttp:

From d9a2908528b8174fa4c8e9b230ac292ac6d973ed Mon Sep 17 00:00:00 2001
From: Roger Wang <ywang@roblox.com>
Date: Mon, 1 Jul 2024 22:23:27 -0700
Subject: [PATCH 158/181] update llava next example

---
 examples/llava_next_example.py | 61 +++++++++++++++++++---------------
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index e90a86abe41c..3c39590e7fb8 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -4,35 +4,44 @@
 from PIL import Image
 
 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData
 
 # Dynamic image input is currently not supported and therefore
 # a fixed image input shape and its corresponding feature size is required.
 # See https://github.com/vllm-project/vllm/pull/4199 for the complete
 # configuration matrix.
 
-llm = LLM(
-    model="llava-hf/llava-v1.6-mistral-7b-hf",
-    image_input_type="pixel_values",
-    image_token_id=32000,
-    image_input_shape="1,3,336,336",
-    image_feature_size=1176,
-)
-
-prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
-url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
-image = Image.open(BytesIO(requests.get(url).content))
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
-
-outputs = llm.generate(
-    {
-        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
-    },
-    sampling_params=sampling_params)
-
-generated_text = ""
-for o in outputs:
-    generated_text += o.outputs[0].text
-
-print(f"LLM output:{generated_text}")
+
+def run_llava_next():
+    llm = LLM(
+        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        image_token_id=32000,
+        image_input_shape="1,3,336,336",
+        image_feature_size=1176,
+    )
+
+    prompt = "[INST] " + "<image>" * 1176 + (
+        "\nWhat is shown in this image? [/INST]")
+    url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+    image = Image.open(BytesIO(requests.get(url).content))
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=100)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": image
+            }
+        },
+        sampling_params=sampling_params)
+
+    generated_text = ""
+    for o in outputs:
+        generated_text += o.outputs[0].text
+
+    print(f"LLM output:{generated_text}")
+
+
+if __name__ == "__main__":
+    run_llava_next()

From abd56fc01592a9bfce0789f60f17cb620ec80bdf Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 06:16:35 +0000
Subject: [PATCH 159/181] Update comments

---
 examples/llava_example.py      | 2 +-
 examples/llava_next_example.py | 8 +++-----
 examples/phi3v_example.py      | 5 ++++-
 vllm/multimodal/base.py        | 2 +-
 4 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/llava_example.py b/examples/llava_example.py
index f5cb2a661e83..40ffe9bd3fe2 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -14,7 +14,7 @@ def run_llava():
         model="llava-hf/llava-1.5-7b-hf",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
-        image_feature_size=576,
+        image_feature_size=576,  # The value does not matter
     )
 
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index 269ba72e7eb3..dd3eeed05072 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -5,10 +5,8 @@
 
 from vllm import LLM, SamplingParams
 
-# Dynamic image input is currently not supported and therefore
-# a fixed image input shape and its corresponding feature size is required.
-# See https://github.com/vllm-project/vllm/pull/4199 for the complete
-# configuration matrix.
+# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
+# You can use `.buildkite/download-images.sh` to download them
 
 
 def run_llava_next():
@@ -16,7 +14,7 @@ def run_llava_next():
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
-        image_feature_size=1176,
+        image_feature_size=1176,  # The value does not matter
     )
 
     prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index a9a8c499f91d..d91d54117fad 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -5,6 +5,9 @@
 
 from vllm import LLM, SamplingParams
 
+# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
+# You can use `.buildkite/download-images.sh` to download them
+
 
 def run_phi3v():
     model_path = "microsoft/Phi-3-vision-128k-instruct"
@@ -18,7 +21,7 @@ def run_phi3v():
         trust_remote_code=True,
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
-        image_feature_size=1921,
+        image_feature_size=1921,  # The value does not matter
         max_num_seqs=5,
     )
 
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index c8daf9ea1df2..4962aefa16d8 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -65,7 +65,7 @@ def try_concat(
     ) -> BatchedTensors:
         # Avoid initializing CUDA too early
         import torch
-        
+
         unbatched_shape = tensors[0].shape[1:]
 
         for tensor in tensors:

From 38042ab7bf975c957c254428a0f78d7b48a56154 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 06:31:53 +0000
Subject: [PATCH 160/181] Remove some unnecessary deferred imports

---
 tests/models/test_llava.py      | 7 ++-----
 tests/models/test_llava_next.py | 7 ++-----
 tests/models/test_phi3v.py      | 7 ++-----
 tests/models/utils.py           | 6 ++----
 4 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 9b953457f795..916849b7cf73 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,17 +1,14 @@
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from .utils import check_logprobs_close
 
-if TYPE_CHECKING:
-    # it may call torch.cuda.device_count()
-    from vllm.sequence import SampleLogprobs
-
 pytestmark = pytest.mark.vlm
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 073cba4a1327..097ccf3b2fb5 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -1,18 +1,15 @@
 import re
-from typing import TYPE_CHECKING, List, Optional, Tuple
+from typing import List, Optional, Tuple
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS
 from .utils import check_logprobs_close
 
-if TYPE_CHECKING:
-    # it may call torch.cuda.device_count()
-    from vllm.sequence import SampleLogprobs
-
 pytestmark = pytest.mark.vlm
 
 _PREFACE = (
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 5a6e235b4884..bf66ee2a5b7a 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -1,19 +1,16 @@
 import re
-from typing import TYPE_CHECKING, List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type
 
 import pytest
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
 from .utils import check_logprobs_close
 
-if TYPE_CHECKING:
-    # it may call torch.cuda.device_count()
-    from vllm.sequence import SampleLogprobs
-
 pytestmark = pytest.mark.vlm
 
 HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
diff --git a/tests/models/utils.py b/tests/models/utils.py
index a6c46554edaf..9046e1816de2 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -1,9 +1,7 @@
 import warnings
-from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union
+from typing import Dict, List, Optional, Sequence, Tuple, Union
 
-if TYPE_CHECKING:
-    # it may call torch.cuda.device_count()
-    from vllm.sequence import SampleLogprobs
+from vllm.sequence import SampleLogprobs
 
 TokensText = Tuple[List[int], str]
 

From 9a49d2cd7cce6262e7fd3483ac4a26c61729d2e6 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 08:04:48 +0000
Subject: [PATCH 161/181] Use more precise type annotation

---
 vllm/sequence.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/sequence.py b/vllm/sequence.py
index 3e7c31b8c1a8..2105b1690ce4 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -457,7 +457,7 @@ def prompt_token_ids(self) -> List[int]:
         return next(iter(self.seqs_dict.values())).prompt_token_ids
 
     @property
-    def multi_modal_data(self) -> Optional["MultiModalDataDict"]:
+    def multi_modal_data(self) -> "MultiModalDataDict":
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
         return next(iter(self.seqs_dict.values())).multi_modal_data

From ac6f4fa59dea40deef64d2df9f368a50b104b199 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 08:13:06 +0000
Subject: [PATCH 162/181] Fix wrong feature size

---
 examples/llava_example.py       | 2 +-
 examples/llava_next_example.py  | 3 ++-
 examples/phi3v_example.py       | 3 ++-
 tests/models/test_llava.py      | 2 +-
 tests/models/test_llava_next.py | 5 +----
 tests/models/test_phi3v.py      | 6 +++---
 tests/models/utils.py           | 2 +-
 7 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/examples/llava_example.py b/examples/llava_example.py
index 40ffe9bd3fe2..f5cb2a661e83 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -14,7 +14,7 @@ def run_llava():
         model="llava-hf/llava-1.5-7b-hf",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
-        image_feature_size=576,  # The value does not matter
+        image_feature_size=576,
     )
 
     prompt = "USER: <image>\nWhat is the content of this image?\nASSISTANT:"
diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index dd3eeed05072..e9e87462900f 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -14,7 +14,8 @@ def run_llava_next():
         model="llava-hf/llava-v1.6-mistral-7b-hf",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
-        image_feature_size=1176,  # The value does not matter
+        # Use the maximum possible value for memory profiling
+        image_feature_size=2928,
     )
 
     prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py
index d91d54117fad..0aabfee6ab63 100644
--- a/examples/phi3v_example.py
+++ b/examples/phi3v_example.py
@@ -21,7 +21,8 @@ def run_phi3v():
         trust_remote_code=True,
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
-        image_feature_size=1921,  # The value does not matter
+        # Use the maximum possible value for memory profiling
+        image_feature_size=2653,
         max_num_seqs=5,
     )
 
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 916849b7cf73..c0907dc689b1 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -38,7 +38,7 @@ def iter_llava_configs(model_name: str):
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
-                                         Optional["SampleLogprobs"]],
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 097ccf3b2fb5..4f13fa8592aa 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -29,9 +29,6 @@ def iter_llava_next_configs(model_name: str):
     # Need to use the max possible feature size for profile_run
     image_hw_to_feature_size = {
         (336, 336): 2928,
-        (672, 672): 2928,
-        (1344, 336): 2928,
-        (336, 1344): 2928,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -50,7 +47,7 @@ def iter_llava_next_configs(model_name: str):
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
-                                         Optional["SampleLogprobs"]],
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index bf66ee2a5b7a..2e71e0cee601 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -22,9 +22,9 @@
 
 
 def iter_phi3v_configs(model_name: str):
+    # Need to use the max possible feature size for profile_run
     image_hw_to_feature_size = {
-        (1008, 1344): 1921,
-        (2016, 2688): 1933,
+        (1008, 1344): 2653,
     }
 
     for (h, w), f in image_hw_to_feature_size.items():
@@ -41,7 +41,7 @@ def iter_phi3v_configs(model_name: str):
 
 
 def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
-                                         Optional["SampleLogprobs"]],
+                                         Optional[SampleLogprobs]],
                       vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
diff --git a/tests/models/utils.py b/tests/models/utils.py
index 9046e1816de2..51d57129d9d2 100644
--- a/tests/models/utils.py
+++ b/tests/models/utils.py
@@ -36,7 +36,7 @@ def check_outputs_equal(
 
 TokensTextLogprobs = Tuple[List[int], str, Optional[Union[List[Dict[int,
                                                                     float]],
-                                                          "SampleLogprobs"]]]
+                                                          SampleLogprobs]]]
 
 
 def check_logprobs_close(

From 3f957781a1b52772d9272ab5cab0692af64d20c1 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 08:16:15 +0000
Subject: [PATCH 163/181] Fix wrong image

---
 tests/multimodal/test_mapper.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py
index 7009ccc6ed2c..321566ad53a5 100644
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -34,7 +34,7 @@ def test_clip_image_processor(image_assets, dtype, size_factor):
         )
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            {"image": asset.pil_image},
+            {"image": image},
         )
 
         assert hf_result.keys() == vllm_result.keys()
@@ -73,7 +73,7 @@ def test_llava_next_image_processor(image_assets, dtype, size_factor):
         )
         vllm_result = MULTIMODAL_REGISTRY.map_input(
             model_config,
-            {"image": asset.pil_image},
+            {"image": image},
         )
 
         assert hf_result.keys() == vllm_result.keys()

From 90e80c435a07c0587e9e1afbe9d7fc0d1a760226 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 08:19:14 +0000
Subject: [PATCH 164/181] Remove unnecessary lazy import

---
 vllm/config.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/config.py b/vllm/config.py
index b30f616b78cd..4f5c042a0e8b 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -10,6 +10,7 @@
 import vllm.envs as envs
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+from vllm.model_executor.models import ModelRegistry
 from vllm.tracing import is_otel_installed
 from vllm.transformers_utils.config import get_config, get_hf_text_config
 from vllm.utils import (cuda_device_count_stateless, get_cpu_memory, is_cpu,
@@ -176,9 +177,6 @@ def _verify_tokenizer_mode(self) -> None:
         self.tokenizer_mode = tokenizer_mode
 
     def _verify_embedding_mode(self) -> None:
-        # it may call torch.cuda.device_count()
-        from vllm.model_executor.models import ModelRegistry
-
         architectures = getattr(self.hf_config, "architectures", [])
         self.embedding_mode = any(
             ModelRegistry.is_embedding_model(arch) for arch in architectures)

From ea622c71c76edc040fd7f6404aef72e1422dd4d6 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 08:42:10 +0000
Subject: [PATCH 165/181] Check for conflicting kwargs in `map_input`

---
 vllm/multimodal/registry.py | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index 63164797c9a8..c7ef459fcc56 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,12 +1,16 @@
 import functools
-from typing import Optional, Sequence
+from typing import TYPE_CHECKING, Dict, Optional, Sequence
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
 
-from .base import MultiModalDataDict, MultiModalInputMapper, MultiModalPlugin
+from .base import (MultiModalDataDict, MultiModalInputMapper, MultiModalInputs,
+                   MultiModalPlugin)
 from .image import ImagePlugin
 
+if TYPE_CHECKING:
+    import torch
+
 logger = init_logger(__name__)
 
 
@@ -57,7 +61,7 @@ def register_image_input_mapper(
         return self.register_input_mapper("image", mapper)
 
     def _process_input(self, key: str, value: object,
-                       model_config: ModelConfig):
+                       model_config: ModelConfig) -> MultiModalInputs:
         plugin = self._plugins.get(key)
         if plugin:
             return plugin.map_input(model_config, value)
@@ -89,16 +93,28 @@ def register_image_input(self,
         """
         return self.register_input_mapper("image", mapper)
 
-    def map_input(self, model_config: ModelConfig, data: MultiModalDataDict):
+    def map_input(self, model_config: ModelConfig,
+                  data: MultiModalDataDict) -> MultiModalInputs:
         """
         Apply an input mapper to the data passed to the model.
         
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
-        result_list = [
-            self._process_input(k, v, model_config) for k, v in data.items()
-        ]
-        return {k: v for d in result_list for k, v in d.items()}
+        merged_dict: Dict[str, "torch.Tensor"] = {}
+
+        for data_key, data_value in data.items():
+            input_dict = self._process_input(data_key, data_value,
+                                             model_config)
+
+            for input_key, input_tensor in input_dict.items():
+                if input_key in merged_dict:
+                    raise ValueError(f"The input mappers (keys={set(data)}) "
+                                     f"resulted in a conflicting keyword "
+                                     f"argument to `forward()`: {input_key}")
+
+                merged_dict[input_key] = input_tensor
+
+        return MultiModalInputs(merged_dict)
 
     def create_input_mapper(self, model_config: ModelConfig):
         """

From 18740c22502ba6ec369cb66f5296ee391ce366a6 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 08:42:20 +0000
Subject: [PATCH 166/181] Avoid unnecessary processing

---
 vllm/entrypoints/openai/serving_chat.py | 2 +-
 vllm/worker/cpu_model_runner.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 609b0b68bacd..57ad7bdd3105 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -272,7 +272,7 @@ async def create_chat_completion(
             "prompt": prompt_text,
             "prompt_token_ids": prompt_ids,
         }
-        if mm_data is not None:
+        if mm_data:
             inputs["multi_modal_data"] = mm_data
 
         is_tracing_enabled = await self.engine.is_tracing_enabled()
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 404a57ab5da6..82b157bf0c79 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -161,7 +161,7 @@ def _prepare_prompt(
             input_positions.extend(list(range(computed_len, seq_len)))
 
             mm_data = seq_group_metadata.multi_modal_data
-            if mm_data is not None:
+            if mm_data:
                 mm_kwargs = self.multi_modal_input_mapper(mm_data)
                 multi_modal_inputs_list.append(mm_kwargs)
 

From a0db2c7a126cd55a912c83a54493697b214da5b3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 08:57:23 +0000
Subject: [PATCH 167/181] Update doc

---
 docs/source/models/vlm.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index c89cf809dd6c..3003bbd9fdee 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -41,6 +41,11 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
     )
 
 .. important::
+    Currently, you have to specify ``image_feature_size`` to support memory profiling.
+    To avoid OOM during runtime, you should set this to the maximum value supported by the model.
+    The calculation of feature size is specific to the model. For more details, please refer to
+    the function :code:`get_<model_name>_image_feature_size` inside the corresponding model file.
+
     We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
 
 
@@ -100,6 +105,11 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
         --chat-template template_llava.jinja
 
 .. important::
+    Currently, you have to specify ``image_feature_size`` to support memory profiling.
+    To avoid OOM during runtime, you should set this to the maximum value supported by the model.
+    The calculation of feature size is specific to the model. For more details, please refer to
+    the function :code:`get_<model_name>_image_feature_size` inside the corresponding model file.
+
     We will remove most of the vision-specific arguments in a future release as they can be inferred from the HuggingFace configuration.
 
 To consume the server, you can use the OpenAI client like in the example below:

From 526a8710ff37198022fa5b1729edd790efff3cc6 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 09:25:39 +0000
Subject: [PATCH 168/181] Avoid cuda init

---
 vllm/transformers_utils/image_processor.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index 354dcb526395..3d3dd6f5cd85 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,5 +1,4 @@
-from transformers import AutoImageProcessor
-from transformers.image_processing_utils import BaseImageProcessor
+from typing import cast
 
 from vllm.logger import init_logger
 
@@ -11,10 +10,15 @@ def get_image_processor(
     *args,
     trust_remote_code: bool = False,
     **kwargs,
-) -> BaseImageProcessor:
+):
     """Gets an image processor for the given model name via HuggingFace."""
+    # don't put this import at the top level
+    # it will call torch.cuda.device_count()
+    from transformers import AutoImageProcessor
+    from transformers.image_processing_utils import BaseImageProcessor
+
     try:
-        processor: BaseImageProcessor = AutoImageProcessor.from_pretrained(
+        processor = AutoImageProcessor.from_pretrained(
             processor_name,
             *args,
             trust_remote_code=trust_remote_code,
@@ -34,4 +38,4 @@ def get_image_processor(
         else:
             raise e
 
-    return processor
+    return cast(BaseImageProcessor, processor)

From a5174da6c1312d4af27099bd7e1bb4d6521445c3 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 09:26:14 +0000
Subject: [PATCH 169/181] Remove unused logger

---
 vllm/transformers_utils/image_processor.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
index 3d3dd6f5cd85..c7d9eabd06f0 100644
--- a/vllm/transformers_utils/image_processor.py
+++ b/vllm/transformers_utils/image_processor.py
@@ -1,9 +1,5 @@
 from typing import cast
 
-from vllm.logger import init_logger
-
-logger = init_logger(__name__)
-
 
 def get_image_processor(
     processor_name: str,

From 6cf34e4177e4fe3272afdb04cd0db3b0f0167453 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 10:34:48 +0000
Subject: [PATCH 170/181] Remove unnecessary deferred imports

---
 tests/models/test_llava.py      |  5 +----
 tests/models/test_llava_next.py |  5 +----
 tests/models/test_phi3v.py      |  5 +----
 vllm/inputs/registry.py         |  9 ++++-----
 vllm/multimodal/base.py         | 29 +++++++++++++----------------
 vllm/multimodal/registry.py     |  9 ++++-----
 6 files changed, 24 insertions(+), 38 deletions(-)

diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index c0907dc689b1..dbd8d9ce207a 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -4,6 +4,7 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
@@ -89,10 +90,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from vllm.multimodal.utils import rescale_image_size
-
     model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index 4f13fa8592aa..f7e7e41b73ec 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -5,6 +5,7 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 
 from ..conftest import IMAGE_ASSETS
@@ -104,10 +105,6 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from vllm.multimodal.utils import rescale_image_size
-
     model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index 2e71e0cee601..f797094aa2a5 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -5,6 +5,7 @@
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
+from vllm.multimodal.utils import rescale_image_size
 from vllm.sequence import SampleLogprobs
 from vllm.utils import is_cpu
 
@@ -92,10 +93,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    # don't put this import at the top level
-    # it will call torch.cuda.device_count()
-    from vllm.multimodal.utils import rescale_image_size
-
     model_id, vlm_config = model_and_config
     images = [asset.pil_image for asset in image_assets]
 
diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py
index 86680136a5f0..936909eb33f6 100644
--- a/vllm/inputs/registry.py
+++ b/vllm/inputs/registry.py
@@ -3,6 +3,7 @@
 from typing import (TYPE_CHECKING, Callable, Dict, Optional, Tuple, Type,
                     TypeVar)
 
+from torch import nn
 from transformers import PretrainedConfig
 
 from vllm.logger import init_logger
@@ -10,8 +11,6 @@
 from .data import LLMInputs
 
 if TYPE_CHECKING:
-    from torch import nn
-
     from vllm.config import ModelConfig, VisionLanguageConfig
     from vllm.multimodal import MultiModalDataDict
     from vllm.sequence import SequenceData
@@ -64,7 +63,7 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C:
         return hf_config
 
 
-N = TypeVar("N", bound=Type["nn.Module"])
+N = TypeVar("N", bound=Type[nn.Module])
 
 DummyDataFactory = Callable[[InputContext, int],
                             Tuple["SequenceData",
@@ -87,9 +86,9 @@ class InputRegistry:
     """
 
     def __init__(self) -> None:
-        self._dummy_factories_by_model_type: Dict[Type["nn.Module"],
+        self._dummy_factories_by_model_type: Dict[Type[nn.Module],
                                                   DummyDataFactory] = {}
-        self._input_processors_by_model_type: Dict[Type["nn.Module"],
+        self._input_processors_by_model_type: Dict[Type[nn.Module],
                                                    InputProcessor] = {}
 
     def _default_dummy_data_factory(
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 4962aefa16d8..66b0736870c7 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -1,20 +1,18 @@
 import sys
 from abc import ABC, abstractmethod
 from collections import UserDict, defaultdict
-from typing import (TYPE_CHECKING, Any, Callable, Dict, List, Optional, Type,
-                    TypedDict, TypeVar, Union)
+from typing import (Any, Callable, Dict, List, Optional, Type, TypedDict,
+                    TypeVar, Union)
 
+import torch
+import torch.types
 from PIL import Image
+from torch import nn
 
 from vllm.config import ModelConfig
 from vllm.inputs import InputContext
 from vllm.logger import init_logger
 
-if TYPE_CHECKING:
-    import torch
-    import torch.types
-    from torch import nn
-
 logger = init_logger(__name__)
 
 
@@ -35,7 +33,7 @@ class MultiModalData:
     pass
 
 
-BatchedTensors = Union["torch.Tensor", List["torch.Tensor"]]
+BatchedTensors = Union[torch.Tensor, List[torch.Tensor]]
 """
 If each input tensor in the batch has the same size, this is a single batched
 tensor; otherwise, this is a list of tensors with one element per batch.
@@ -47,7 +45,7 @@ class _MultiModalInputsBase(UserDict):
         pass
 else:
 
-    class _MultiModalInputsBase(UserDict[str, "torch.Tensor"]):
+    class _MultiModalInputsBase(UserDict[str, torch.Tensor]):
         pass
 
 
@@ -59,9 +57,9 @@ class MultiModalInputs(_MultiModalInputsBase):
 
     @staticmethod
     def try_concat(
-        tensors: List["torch.Tensor"],
+        tensors: List[torch.Tensor],
         *,
-        device: "torch.types.Device",
+        device: torch.types.Device,
     ) -> BatchedTensors:
         # Avoid initializing CUDA too early
         import torch
@@ -79,7 +77,7 @@ def try_concat(
     @staticmethod
     def batch(
         inputs_list: List["MultiModalInputs"],
-        device: "torch.types.Device",
+        device: torch.types.Device,
     ) -> Dict[str, BatchedTensors]:
         """Batch multiple inputs together into a dictionary."""
         if len(inputs_list) == 0:
@@ -87,7 +85,7 @@ def batch(
 
         keys = inputs_list[0].keys()
 
-        item_lists: Dict[str, List["torch.Tensor"]] = defaultdict(list)
+        item_lists: Dict[str, List[torch.Tensor]] = defaultdict(list)
 
         for inputs in inputs_list:
             if inputs.keys() != keys:
@@ -121,7 +119,7 @@ class MultiModalDataBuiltins(TypedDict, total=False):
 :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers
 and processors in HuggingFace Transformers."""
 
-N = TypeVar("N", bound=Type["nn.Module"])
+N = TypeVar("N", bound=Type[nn.Module])
 
 
 class MultiModalPlugin(ABC):
@@ -136,8 +134,7 @@ class MultiModalPlugin(ABC):
     """
 
     def __init__(self) -> None:
-        self._input_mappers: Dict[Type["nn.Module"],
-                                  MultiModalInputMapper] = {}
+        self._input_mappers: Dict[Type[nn.Module], MultiModalInputMapper] = {}
 
     @abstractmethod
     def get_data_key(self) -> str:
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
index c7ef459fcc56..f17b04149ede 100644
--- a/vllm/multimodal/registry.py
+++ b/vllm/multimodal/registry.py
@@ -1,5 +1,7 @@
 import functools
-from typing import TYPE_CHECKING, Dict, Optional, Sequence
+from typing import Dict, Optional, Sequence
+
+import torch
 
 from vllm.config import ModelConfig
 from vllm.logger import init_logger
@@ -8,9 +10,6 @@
                    MultiModalPlugin)
 from .image import ImagePlugin
 
-if TYPE_CHECKING:
-    import torch
-
 logger = init_logger(__name__)
 
 
@@ -100,7 +99,7 @@ def map_input(self, model_config: ModelConfig,
         
         See :meth:`MultiModalPlugin.map_input` for more details.
         """
-        merged_dict: Dict[str, "torch.Tensor"] = {}
+        merged_dict: Dict[str, torch.Tensor] = {}
 
         for data_key, data_value in data.items():
             input_dict = self._process_input(data_key, data_value,

From aacb5d0eee15cd3f2e61f943496aa906e66221a1 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 11:13:41 +0000
Subject: [PATCH 171/181] Fix typo

---
 docs/source/models/vlm.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 3003bbd9fdee..f8c61018a08d 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -153,4 +153,4 @@ A full code example can be found in `examples/openai_vision_api_client.py <https
         export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
 
 .. note::
-    There is no need to format the prompt in the API request when since it will be handled by the server.
+    There is no need to format the prompt in the API request since it will be handled by the server.

From 13f43bdf4ca79ca8c21346b8f9ff4359f0c85141 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 17:36:29 +0000
Subject: [PATCH 172/181] Address comments

---
 examples/llava_next_example.py |  3 ---
 vllm/multimodal/base.py        | 18 ------------------
 2 files changed, 21 deletions(-)

diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py
index e9e87462900f..20d4791ffaf9 100644
--- a/examples/llava_next_example.py
+++ b/examples/llava_next_example.py
@@ -5,9 +5,6 @@
 
 from vllm import LLM, SamplingParams
 
-# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
-# You can use `.buildkite/download-images.sh` to download them
-
 
 def run_llava_next():
     llm = LLM(
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
index 66b0736870c7..e7b45649d728 100644
--- a/vllm/multimodal/base.py
+++ b/vllm/multimodal/base.py
@@ -15,24 +15,6 @@
 
 logger = init_logger(__name__)
 
-
-class MultiModalData:
-    """
-    Base class that contains multi-modal data.
-
-    To add a new modality, add a new file under ``multimodal`` directory.
-
-    In this new file, subclass :class:`~MultiModalData` and
-    :class:`~MultiModalPlugin`.
-
-    Finally, register the new plugin to
-    :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.
-    This enables models to call :meth:`MultiModalRegistry.map_input` for
-    the new modality.
-    """
-    pass
-
-
 BatchedTensors = Union[torch.Tensor, List[torch.Tensor]]
 """
 If each input tensor in the batch has the same size, this is a single batched

From 00e9e39e87022e4ba908ae68018b1a5e7ee93e5b Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Tue, 2 Jul 2024 17:38:26 +0000
Subject: [PATCH 173/181] Add comment

---
 vllm/multimodal/image.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index 348ee96d4866..dfef33121cbf 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -65,6 +65,7 @@ def repeat_and_pad_image_tokens(
             ))
 
         image_token_count = prompt.count(image_token_str)
+        # This is an arbitrary number to distinguish between the two cases
         if image_token_count > 16:
             logger.warning(
                 "Please follow the prompt format that is "

From a231eaffe8752be6d7b3f4fa9cab69900833e2b9 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 00:18:45 +0000
Subject: [PATCH 174/181] Update XPU runner's multimodal logic

---
 vllm/worker/cpu_model_runner.py |  7 ++--
 vllm/worker/xpu_model_runner.py | 69 +++++++++++++++++++++------------
 2 files changed, 48 insertions(+), 28 deletions(-)

diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index b6378dcf176a..982782355899 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -188,9 +188,6 @@ def _prepare_prompt(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
-        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
-                                                    device=self.device)
-
         num_prompt_tokens = len(input_tokens)
 
         input_tokens = torch.tensor(input_tokens,
@@ -214,6 +211,10 @@ def _prepare_prompt(
             block_tables=torch.tensor([]),
             slot_mapping=slot_mapping,
         )
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
         return (input_tokens, input_positions, attn_metadata, seq_lens,
                 multi_modal_kwargs)
 
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index e652f1b1042e..47a48b8d1c62 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Type, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
+                    Type, Union)
 
 import torch
 import torch.nn as nn
@@ -9,8 +10,11 @@
                          ModelConfig, ParallelConfig, SchedulerConfig,
                          VisionLanguageConfig)
 from vllm.distributed import broadcast_tensor_dict
+from vllm.inputs import INPUT_REGISTRY
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData,
                            SequenceGroupMetadata)
@@ -44,7 +48,7 @@ class ModelInputForXPU(ModelRunnerInputBase):
     input_positions: Optional[torch.Tensor] = None
     attn_metadata: Optional["AttentionMetadata"] = None
     sampling_metadata: Optional["SamplingMetadata"] = None
-    multi_modal_input: Optional[Dict[str, torch.Tensor]] = None
+    multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -116,6 +120,10 @@ def __init__(
             self.block_size,
         )
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
 
@@ -156,12 +164,26 @@ def profile_run(self) -> None:
         # To exercise the worst scenario for GPU memory consumption,
         # the number of seqs (batch_size) is chosen to maximize the number
         # of images processed.
+        model_config = self.model_config
+        vlm_config = self.vision_language_config
+
+        if vlm_config:
+            max_num_seqs = min(
+                max_num_seqs,
+                int(max_num_batched_tokens / vlm_config.image_feature_size))
+
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
 
-            seq_data = SequenceData([0] * seq_len)
-            dummy_multi_modal_data = None
+            seq_data, dummy_multi_modal_data = INPUT_REGISTRY \
+                .dummy_data_for_profiling(model_config, seq_len)
+
+            # Having more tokens is over-conservative but otherwise fine
+            assert len(seq_data.prompt_token_ids) >= seq_len, (
+                f"Expected at least {seq_len} dummy tokens for profiling, "
+                f"but got: {len(seq_data.prompt_token_ids)}")
+
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
                 is_prompt=True,
@@ -194,7 +216,7 @@ def prepare_model_input(
             virtual_engine: int = 0,
             finished_requests_ids: Optional[List[str]] = None
     ) -> ModelInputForXPU:
-        multi_modal_input = None
+        multi_modal_kwargs = None
         if self.is_driver_worker:
             # NOTE: We assume that all sequences in the group are all prompts or
             # all decodes.
@@ -202,7 +224,7 @@ def prepare_model_input(
             # Prepare input tensors.
             if is_prompt:
                 (input_tokens, input_positions, attn_metadata, seq_lens,
-                 multi_modal_input
+                 multi_modal_kwargs
                  ) = self._prepare_prompt(seq_group_metadata_list)
             else:
                 (input_tokens, input_positions,
@@ -232,6 +254,7 @@ def prepare_model_input(
             input_positions = metadata_dict.pop("input_positions")
             selected_token_indices = metadata_dict.pop(
                 "selected_token_indices")
+            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
             attn_metadata = self.attn_backend.make_metadata(**metadata_dict)
             sampling_metadata = SamplingMetadata(
                 seq_groups=None,
@@ -244,7 +267,7 @@ def prepare_model_input(
                                 input_positions=input_positions,
                                 attn_metadata=attn_metadata,
                                 sampling_metadata=sampling_metadata,
-                                multi_modal_input=multi_modal_input)
+                                multi_modal_kwargs=multi_modal_kwargs)
 
     def _prepare_decode(
         self,
@@ -351,9 +374,9 @@ def execute_model(
             "kv_caches": kv_caches,
             "attn_metadata": model_input.attn_metadata,
         }
-        if self.vision_language_config:
-            execute_model_kwargs.update(
-                {"image_input": model_input.multi_modal_input})
+        if (self.vision_language_config
+                and model_input.multi_modal_kwargs is not None):
+            execute_model_kwargs.update(model_input.multi_modal_kwargs)
 
         hidden_states = model_executable(**execute_model_kwargs)
 
@@ -376,13 +399,13 @@ def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
-               Optional[torch.Tensor]]:
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
-        multi_modal_input_list: List[torch.Tensor] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -403,9 +426,10 @@ def _prepare_prompt(
             # is always the first token in the sequence.
             input_positions.extend(list(range(computed_len, seq_len)))
 
-            if seq_group_metadata.multi_modal_data:
-                multi_modal_input_list.append(
-                    seq_group_metadata.multi_modal_data.data)
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
 
             if seq_group_metadata.block_tables is None:
                 # During memory profiling, the block tables are not initialized
@@ -435,15 +459,6 @@ def _prepare_prompt(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
-        if multi_modal_input_list:
-            assert self.vision_language_config, (
-                "Multi-modal inputs are only supported by "
-                "vision language models.")
-            multi_modal_input = torch.cat(multi_modal_input_list,
-                                          dim=0).to(self.device)
-        else:
-            multi_modal_input = None
-
         num_prompt_tokens = len(input_tokens)
 
         input_tokens = torch.tensor(input_tokens,
@@ -475,5 +490,9 @@ def _prepare_prompt(
             num_decode_tokens=0,
             block_tables=torch.tensor([], device=self.device, dtype=torch.int),
         )
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
         return (input_tokens, input_positions, attn_metadata, seq_lens,
-                multi_modal_input)
+                multi_modal_kwargs)

From ec74121c2ca078d7f915e7415f286b68c705c776 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 00:46:14 +0000
Subject: [PATCH 175/181] Fix unused import

---
 vllm/worker/xpu_model_runner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 47a48b8d1c62..132c97bd79e5 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -16,7 +16,7 @@
 from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
                              MultiModalInputs)
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (IntermediateTensors, SamplerOutput, SequenceData,
+from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
 from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad
 from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata

From d16d3c894a2d21b3a778bb9da09865ece823d6bb Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 00:49:07 +0000
Subject: [PATCH 176/181] Fix feature size calculation

---
 vllm/model_executor/models/llava_next.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index 749b59a15f6d..92604cdf3760 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -59,6 +59,8 @@ class LlavaNextImagePixelInputs(TypedDict):
 
 
 # Taken from: https://github.com/huggingface/text-generation-inference/blob/v2.0.4/server/text_generation_server/models/vlm_causal_lm.py#L91
+# NOTE: new_height and new_width are further incremented to properly invert the
+# floordiv operation: https://github.com/huggingface/transformers/blob/v4.42.2/src/transformers/models/llava_next/modeling_llava_next.py#L133
 def _get_llava_next_num_unpadded_features(
     height: int,
     width: int,
@@ -73,9 +75,13 @@ def _get_llava_next_num_unpadded_features(
     current_aspect_ratio: float = current_width / current_height
     if aspect_ratio > current_aspect_ratio:
         new_height = (height * current_width) // width
+        if new_height % 2 == 1:
+            new_height += 1
         current_height = new_height
     else:
         new_width = (width * current_height) // height
+        if new_width % 2 == 1:
+            new_width += 1
         current_width = new_width
 
     unpadded_features = current_height * current_width
@@ -128,7 +134,10 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int):
     # Result in the max possible feature size (2x2 grid of 336x336px tiles)
     dummy_height = dummy_width = 448
     image_feature_size = get_llava_next_image_feature_size(
-        hf_config, input_height=dummy_height, input_width=dummy_width)
+        hf_config,
+        input_height=dummy_height,
+        input_width=dummy_width,
+    )
 
     if isinstance(vision_config, CLIPVisionConfig):
         seq_data = dummy_seq_data_for_clip(
@@ -164,7 +173,10 @@ def input_processor_for_llava_next(ctx: InputContext, llm_inputs: LLMInputs):
         width, height = image_data.size
 
         image_feature_size = get_llava_next_image_feature_size(
-            hf_config, input_height=height, input_width=width)
+            hf_config,
+            input_height=height,
+            input_width=width,
+        )
     elif isinstance(image_data, torch.Tensor):
         raise NotImplementedError("Embeddings input is not supported yet")
     else:
@@ -277,10 +289,10 @@ def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
             strategy=self.config.vision_feature_select_strategy,
         )
 
+    # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
     def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
                                       patch_embeddings: torch.Tensor, *,
                                       strategy: str) -> torch.Tensor:
-        # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
         if strategy == "flat":
             return patch_embeddings.flatten(0, 1)
 

From aaa0f1ff6f7380659ee761bebce452c415a7fe24 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 01:30:32 +0000
Subject: [PATCH 177/181] Add extra image to test

---
 tests/conftest.py               | 22 +++++++++---
 tests/models/test_llava.py      |  2 ++
 tests/models/test_llava_next.py |  2 ++
 tests/models/test_phi3v.py      |  2 ++
 tests/multimodal/test_utils.py  | 29 +++++++++------
 vllm/multimodal/utils.py        | 64 ++++++++++++++++++++++++++-------
 6 files changed, 92 insertions(+), 29 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 47d31dd718fc..608a5f49d593 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -23,6 +23,7 @@
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
+from vllm.multimodal.utils import fetch_image
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu
 
@@ -44,16 +45,22 @@ def _read_prompts(filename: str) -> List[str]:
 
 @dataclass(frozen=True)
 class ImageAsset:
-    name: Literal["stop_sign", "cherry_blossom"]
+    name: Literal["stop_sign", "cherry_blossom", "boardwalk"]
 
     @cached_property
     def pil_image(self) -> Image.Image:
+        if self.name == "boardwalk":
+            return fetch_image(
+                "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+            )
+
         return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
 
 
 class _ImageAssetPrompts(TypedDict):
     stop_sign: str
     cherry_blossom: str
+    boardwalk: str
 
 
 if sys.version_info < (3, 9):
@@ -69,9 +76,11 @@ class _ImageAssetsBase(UserList[ImageAsset]):
 class _ImageAssets(_ImageAssetsBase):
 
     def __init__(self) -> None:
-        super().__init__(
-            [ImageAsset("stop_sign"),
-             ImageAsset("cherry_blossom")])
+        super().__init__([
+            ImageAsset("stop_sign"),
+            ImageAsset("cherry_blossom"),
+            ImageAsset("boardwalk")
+        ])
 
     def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
         """
@@ -80,7 +89,10 @@ def prompts(self, prompts: _ImageAssetPrompts) -> List[str]:
         The order of the returned prompts matches the order of the
         assets when iterating through this object.
         """
-        return [prompts["stop_sign"], prompts["cherry_blossom"]]
+        return [
+            prompts["stop_sign"], prompts["cherry_blossom"],
+            prompts["boardwalk"]
+        ]
 
 
 IMAGE_ASSETS = _ImageAssets()
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index dbd8d9ce207a..2f4b85bc1617 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -17,6 +17,8 @@
     "USER: <image>\nWhat's the content of the image?\nASSISTANT:",
     "cherry_blossom":
     "USER: <image>\nWhat is the season?\nASSISTANT:",
+    "boardwalk":
+    "USER: <image>\nWhat's in this image?\nASSISTANT:",
 })
 
 
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
index f7e7e41b73ec..8817f41a62f7 100644
--- a/tests/models/test_llava_next.py
+++ b/tests/models/test_llava_next.py
@@ -23,6 +23,8 @@
     f"{_PREFACE} USER: <image>\nWhat's the content of the image? ASSISTANT:",
     "cherry_blossom":
     f"{_PREFACE} USER: <image>\nWhat is the season? ASSISTANT:",
+    "boardwalk":
+    f"{_PREFACE} USER: <image>\nWhat's in this image? ASSISTANT:",
 })
 
 
diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py
index f797094aa2a5..f144f97551c0 100644
--- a/tests/models/test_phi3v.py
+++ b/tests/models/test_phi3v.py
@@ -19,6 +19,8 @@
     "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n",  # noqa: E501
     "cherry_blossom":
     "<|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n",
+    "boardwalk":
+    "<|user|>\n<|image_1|>\nWhat's in this image?<|end|>\n<|assistant|>\n",
 })
 
 
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 5a6395ac9e42..10cabdadb1dc 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -5,10 +5,9 @@
 
 import numpy as np
 import pytest
-import pytest_asyncio
 from PIL import Image
 
-from vllm.multimodal.utils import ImageFetchAiohttp
+from vllm.multimodal.utils import ImageFetchAiohttp, fetch_image
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_URLS = [
@@ -19,12 +18,9 @@
 ]
 
 
-@pytest_asyncio.fixture(scope="session")
-async def url_images() -> Dict[str, Image.Image]:
-    return {
-        image_url: await ImageFetchAiohttp.fetch_image(image_url)
-        for image_url in TEST_IMAGE_URLS
-    }
+@pytest.fixture(scope="module")
+def url_images() -> Dict[str, Image.Image]:
+    return {image_url: fetch_image(image_url) for image_url in TEST_IMAGE_URLS}
 
 
 def get_supported_suffixes() -> Tuple[str, ...]:
@@ -41,7 +37,15 @@ def _image_equals(a: Image.Image, b: Image.Image) -> bool:
     return (np.asarray(a) == np.asarray(b.convert(a.mode))).all()
 
 
-@pytest.mark.asyncio
+@pytest.mark.asyncio(scope="module")
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_fetch_image_http(image_url: str):
+    image_sync = fetch_image(image_url)
+    image_async = await ImageFetchAiohttp.fetch_image(image_url)
+    assert _image_equals(image_sync, image_async)
+
+
+@pytest.mark.asyncio(scope="module")
 @pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
 @pytest.mark.parametrize("suffix", get_supported_suffixes())
 async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
@@ -68,8 +72,11 @@ async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
         base64_image = base64.b64encode(f.read()).decode("utf-8")
         data_url = f"data:{mime_type};base64,{base64_image}"
 
-        data_image = await ImageFetchAiohttp.fetch_image(data_url)
+        data_image_sync = fetch_image(data_url)
         if _image_equals(url_image, Image.open(f)):
-            assert _image_equals(url_image, data_image)
+            assert _image_equals(url_image, data_image_sync)
         else:
             pass  # Lossy format; only check that image can be opened
+
+        data_image_async = await ImageFetchAiohttp.fetch_image(data_url)
+        assert _image_equals(data_image_sync, data_image_async)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 4080f9ed54ff..e55b8bbfdeaa 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -4,10 +4,56 @@
 from urllib.parse import urlparse
 
 import aiohttp
+import requests
 from PIL import Image
 
 from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
 from vllm.multimodal.base import MultiModalDataDict
+from vllm.version import __version__ as VLLM_VERSION
+
+
+def _validate_remote_url(url: str, *, name: str):
+    parsed_url = urlparse(url)
+    if parsed_url.scheme not in ["http", "https"]:
+        raise ValueError(f"Invalid '{name}': A valid '{name}' "
+                         "must have scheme 'http' or 'https'.")
+
+
+def _get_request_headers():
+    return {"User-Agent": f"vLLM/{VLLM_VERSION}"}
+
+
+def _load_image_from_bytes(b: bytes):
+    image = Image.open(BytesIO(b))
+    image.load()
+    return image
+
+
+def _load_image_from_data_url(image_url: str):
+    # Only split once and assume the second part is the base64 encoded image
+    _, image_base64 = image_url.split(",", 1)
+    return load_image_from_base64(image_base64)
+
+
+def fetch_image(image_url: str) -> Image.Image:
+    """Load PIL image from a url or base64 encoded openai GPT4V format"""
+    if image_url.startswith('http'):
+        _validate_remote_url(image_url, name="image_url")
+
+        headers = _get_request_headers()
+
+        with requests.get(url=image_url, headers=headers) as response:
+            response.raise_for_status()
+            image_raw = response.content
+        image = _load_image_from_bytes(image_raw)
+
+    elif image_url.startswith('data:image'):
+        image = _load_image_from_data_url(image_url)
+    else:
+        raise ValueError("Invalid 'image_url': A valid 'image_url' must start "
+                         "with either 'data:image' or 'http'.")
+
+    return image
 
 
 class ImageFetchAiohttp:
@@ -28,31 +74,23 @@ async def fetch_image(cls, image_url: str) -> Image.Image:
         """Load PIL image from a url or base64 encoded openai GPT4V format"""
 
         if image_url.startswith('http'):
-            parsed_url = urlparse(image_url)
-            if parsed_url.scheme not in ["http", "https"]:
-                raise ValueError("Invalid 'image_url': A valid 'image_url' "
-                                 "must have scheme 'http' or 'https'.")
-            # Avoid circular import
-            from vllm import __version__ as VLLM_VERSION
+            _validate_remote_url(image_url, name="image_url")
 
             client = cls.get_aiohttp_client()
-            headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}
+            headers = _get_request_headers()
 
             async with client.get(url=image_url, headers=headers) as response:
                 response.raise_for_status()
                 image_raw = await response.read()
-            image = Image.open(BytesIO(image_raw))
+            image = _load_image_from_bytes(image_raw)
 
-        # Only split once and assume the second part is the base64 encoded image
         elif image_url.startswith('data:image'):
-            image = load_image_from_base64(image_url.split(',', 1)[1])
-
+            image = _load_image_from_data_url(image_url)
         else:
             raise ValueError(
                 "Invalid 'image_url': A valid 'image_url' must start "
                 "with either 'data:image' or 'http'.")
 
-        image.load()
         return image
 
 
@@ -73,7 +111,7 @@ def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
 
 def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
     """Load image from base64 format."""
-    return Image.open(BytesIO(base64.b64decode(image)))
+    return _load_image_from_bytes(base64.b64decode(image))
 
 
 def rescale_image_size(image: Image.Image, size_factor: float) -> Image.Image:

From cc540c3e2b6069587f2afe70c0e1c55f3a2a8fd6 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 01:49:01 +0000
Subject: [PATCH 178/181] Support multimodal data for neuron and tpu

---
 vllm/worker/neuron_model_runner.py | 37 ++++++++++++++++++++----
 vllm/worker/tpu_model_runner.py    | 45 ++++++++++++++++++++++++++----
 2 files changed, 71 insertions(+), 11 deletions(-)

diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index 8b96966be470..a95468110184 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -1,5 +1,6 @@
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple,
+                    Union)
 
 import torch
 from torch import nn
@@ -9,6 +10,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader.neuron import get_neuron_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import (IntermediateTensors, SamplerOutput,
                            SequenceGroupMetadata)
 from vllm.utils import is_pin_memory_available, make_tensor_with_pad
@@ -29,6 +32,7 @@ class ModelInputForNeuron(ModelRunnerInputBase):
     input_positions: Optional[torch.Tensor] = None
     input_block_ids: Optional[torch.Tensor] = None
     sampling_metadata: Optional["SamplingMetadata"] = None
+    multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]] = None
 
     def as_broadcastable_tensor_dict(
             self) -> Dict[str, Union[int, torch.Tensor]]:
@@ -65,6 +69,10 @@ def __init__(
         self.device = self.device_config.device
         self.pin_memory = is_pin_memory_available()
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
         # Lazy initialization.
         self.model: nn.Module  # initialize after load_model.
 
@@ -76,13 +84,15 @@ def load_model(self) -> None:
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, List[int], Mapping[
+            str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[List[int]] = []
         input_positions: List[List[int]] = []
         input_block_ids: List[int] = []
 
         seq_lens: List[int] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
             seq_ids = list(seq_group_metadata.seq_data.keys())
@@ -102,6 +112,12 @@ def _prepare_prompt(
             assert len(block_table) == 1
             input_block_ids.append(block_table[0])
 
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                # Process multi-modal data
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
         max_seq_len = max(seq_lens)
         assert max_seq_len > 0
         input_tokens = make_tensor_with_pad(input_tokens,
@@ -118,7 +134,11 @@ def _prepare_prompt(
                                        dtype=torch.long,
                                        device=self.device)
 
-        return input_tokens, input_positions, input_block_ids, seq_lens
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
+        return (input_tokens, input_positions, input_block_ids, seq_lens,
+                multi_modal_kwargs)
 
     def _prepare_decode(
         self,
@@ -184,8 +204,9 @@ def prepare_model_input(
         is_prompt = seq_group_metadata_list[0].is_prompt
         # Prepare input tensors.
         if is_prompt:
-            (input_tokens, input_positions, input_block_ids,
-             seq_lens) = self._prepare_prompt(seq_group_metadata_list)
+            (input_tokens, input_positions, input_block_ids, seq_lens,
+             multi_modal_kwargs
+             ) = self._prepare_prompt(seq_group_metadata_list)
         else:
             (input_tokens, input_positions,
              input_block_ids) = self._prepare_decode(seq_group_metadata_list)
@@ -203,7 +224,8 @@ def prepare_model_input(
         return ModelInputForNeuron(input_tokens=input_tokens,
                                    input_positions=input_positions,
                                    input_block_ids=input_block_ids,
-                                   sampling_metadata=sampling_metadata)
+                                   sampling_metadata=sampling_metadata,
+                                   multi_modal_kwargs=multi_modal_kwargs)
 
     @torch.inference_mode()
     def execute_model(
@@ -217,10 +239,13 @@ def execute_model(
             raise ValueError(
                 "NeuronModelRunner does not support multi-step execution.")
 
+        multi_modal_kwargs = model_input.multi_modal_kwargs or {}
+
         hidden_states = self.model(
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
             input_block_ids=model_input.input_block_ids,
+            **multi_modal_kwargs,
         )
 
         # Compute the logits.
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index dd08536efc5f..29928e3a8da0 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -1,5 +1,5 @@
 import time
-from typing import List, Optional, Tuple
+from typing import List, Mapping, Optional, Tuple
 
 import numpy as np
 import torch
@@ -12,6 +12,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import (CompletionSequenceGroupOutput, Logprob,
                            SamplerOutput, SequenceGroupMetadata,
                            SequenceOutput)
@@ -66,6 +68,10 @@ def __init__(
             False,
         )
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
     def load_model(self) -> None:
         self.device = self.device_config.device
 
@@ -193,12 +199,14 @@ def warmup_model(
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor,
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[List[int]] = []
         input_positions: List[List[int]] = []
         prompt_lens: List[int] = []
         slot_mapping: List[List[int]] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -224,6 +232,11 @@ def _prepare_prompt(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping[-1].append(slot)
 
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
         assert len(prompt_lens) > 0
         num_prefills = len(prompt_lens)
         num_prefill_tokens = sum(prompt_lens)
@@ -261,17 +274,24 @@ def _prepare_prompt(
             block_tables=None,
             context_lens=None,
         )
-        return input_tokens, input_positions, attn_metadata, prompt_lens
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
+        return (input_tokens, input_positions, attn_metadata, prompt_lens,
+                multi_modal_kwargs)
 
     def _prepare_decode(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ):
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, torch.Tensor,
+               Mapping[str, BatchedTensors]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[List[int]] = []
         input_positions: List[List[int]] = []
         slot_mapping: List[List[int]] = []
         context_lens: List[int] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
 
         batch_idx = 0
         for seq_group_metadata in seq_group_metadata_list:
@@ -297,6 +317,11 @@ def _prepare_decode(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append([slot])
 
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data:
+                mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                multi_modal_inputs_list.append(mm_kwargs)
+
         batch_size = _get_padded_batch_size(batch_idx)
         num_paddings = batch_size - batch_idx
         input_tokens = input_tokens + [[0]] * num_paddings
@@ -330,7 +355,12 @@ def _prepare_decode(
             block_tables=block_tables,
             context_lens=context_lens,
         )
-        return input_tokens, input_positions, attn_metadata, input_lens
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
+        return (input_tokens, input_positions, attn_metadata, input_lens,
+                multi_modal_kwargs)
 
     def _prepare_sample(
         self,
@@ -483,6 +513,7 @@ def forward(
         kv_caches: List[Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]],
         attn_metadata: AttentionMetadata,
         input_lens: torch.Tensor,
+        multi_modal_kwargs: Optional[Mapping[str, BatchedTensors]],
         t: torch.Tensor,
         p: torch.Tensor,
         num_samples: int,
@@ -496,6 +527,8 @@ def forward(
                 memory profiling at initialization.
             attn_metadata: The Pallas attention metadata.
             input_lens: The actual input lengths of shape [batch_size].
+            multi_modal_kwargs: Keyword arguments from multi-modal data to
+                pass to the model.
             t: The sampling temperature of shape [batch_size].
             p: The top-p probability of shape [batch_size].
         """
@@ -535,11 +568,13 @@ def forward(
             slot_mapping = slot_mapping.flatten()
             attn_metadata.slot_mapping = slot_mapping
 
+        multi_modal_kwargs = multi_modal_kwargs or {}
         hidden_states = self.model(
             token_ids,
             position_ids,
             kv_caches,
             attn_metadata,
+            **multi_modal_kwargs,
         )
         hidden_states = hidden_states.flatten(0, 1)
         logits = self.model.compute_logits(hidden_states, sampling_metadata)

From 48489ef80103e6639352eb032dd62d63bc0c0640 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 01:49:25 +0000
Subject: [PATCH 179/181] Fix broadcasting

---
 vllm/worker/xpu_model_runner.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 132c97bd79e5..568fa8791588 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -245,6 +245,7 @@ def prepare_model_input(
                 "input_positions": input_positions,
                 "selected_token_indices":
                 sampling_metadata.selected_token_indices,
+                "multi_modal_kwargs": multi_modal_kwargs,
             }
             metadata_dict.update(attn_metadata.asdict_zerocopy())
             broadcast_tensor_dict(metadata_dict, src=0)

From 2adc41fd101c9bbfdc7c44e48411d2a501b1f536 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 02:02:33 +0000
Subject: [PATCH 180/181] Fix OpenVINO model runner for multimodal data

---
 vllm/worker/openvino_model_runner.py | 38 +++++++++++++++++++---------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/vllm/worker/openvino_model_runner.py b/vllm/worker/openvino_model_runner.py
index 336eaf814fb3..f064048888a7 100644
--- a/vllm/worker/openvino_model_runner.py
+++ b/vllm/worker/openvino_model_runner.py
@@ -1,4 +1,4 @@
-from typing import List, NamedTuple, Optional, Tuple
+from typing import List, Mapping, NamedTuple, Optional, Tuple
 
 import openvino as ov
 import torch
@@ -12,6 +12,8 @@
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader.openvino import get_model
+from vllm.multimodal import (MULTIMODAL_REGISTRY, BatchedTensors,
+                             MultiModalInputs)
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 
 logger = init_logger(__name__)
@@ -23,7 +25,7 @@ class ModelInput(NamedTuple):
     attn_metadata: Optional[OpenVINOAttentionMetadata]
     seq_lens: List[int]
     query_lens: List[int]
-    multi_modal_input: Optional[torch.Tensor]
+    multi_modal_kwargs: Mapping[str, BatchedTensors]
 
     @classmethod
     def empty(cls, device):
@@ -32,7 +34,7 @@ def empty(cls, device):
                           attn_metadata=None,
                           seq_lens=[],
                           query_lens=[],
-                          multi_modal_input=None)
+                          multi_modal_kwargs={})
 
 
 class OpenVINOModelRunner:
@@ -78,6 +80,10 @@ def __init__(
             self.block_size,
         )
 
+        # Multi-modal data support
+        self.multi_modal_input_mapper = MULTIMODAL_REGISTRY \
+            .create_input_mapper(self.model_config)
+
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
 
@@ -108,6 +114,8 @@ def _prepare_model_input(
         seq_lens: List[int] = []
         past_lens: List[int] = []
         query_lens: List[int] = []
+        multi_modal_inputs_list: List[MultiModalInputs] = []
+
         subsequence_begins: List[int] = []
         block_indices: List[int] = []
         block_indices_begins: List[int] = []
@@ -160,6 +168,11 @@ def _prepare_model_input(
                                     and self.sliding_window is None
                                     and is_prompt)
 
+                mm_data = seq_group_metadata.multi_modal_data
+                if mm_data:
+                    mm_kwargs = self.multi_modal_input_mapper(mm_data)
+                    multi_modal_inputs_list.append(mm_kwargs)
+
                 block_table = seq_group_metadata.block_tables[seq_id]
                 # TODO(sang): Combine chunked prefill and prefix caching by
                 # only allowing multiple of block_size chunk size.
@@ -251,22 +264,24 @@ def _prepare_model_input(
             block_indices_begins=block_indices_begins_tensor,
             max_context_len=max_context_len_tensor,
         )
+
+        multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list,
+                                                    device=self.device)
+
         return ModelInput(
             input_tokens,
             input_positions,
             attn_metadata,
             seq_lens,
             query_lens,
-            None,
+            multi_modal_kwargs=multi_modal_kwargs,
         )
 
     def prepare_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, OpenVINOAttentionMetadata,
-               SamplingMetadata, Optional[torch.Tensor], ]:
-        multi_modal_input = None
-
+               SamplingMetadata, Mapping[str, BatchedTensors]]:
         # Prepare input tensors.
         (
             input_tokens,
@@ -274,7 +289,7 @@ def prepare_input_tensors(
             attn_metadata,
             seq_lens,
             query_lens,
-            multi_modal_input,
+            multi_modal_kwargs,
         ) = self._prepare_model_input(seq_group_metadata_list)
 
         sampling_metadata = SamplingMetadata.prepare(
@@ -290,7 +305,7 @@ def prepare_input_tensors(
             input_positions,
             attn_metadata,
             sampling_metadata,
-            multi_modal_input,
+            multi_modal_kwargs,
         )
 
     @torch.inference_mode()
@@ -304,7 +319,7 @@ def execute_model(
             input_positions,
             attn_metadata,
             sampling_metadata,
-            multi_modal_input,
+            multi_modal_kwargs,
         ) = self.prepare_input_tensors(seq_group_metadata_list)
 
         model_executable = self.model
@@ -313,9 +328,8 @@ def execute_model(
             "positions": input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": attn_metadata,
+            **(multi_modal_kwargs or {}),
         }
-        if self.vision_language_config:
-            execute_model_kwargs.update({"image_input": multi_modal_input})
 
         hidden_states = model_executable(**execute_model_kwargs)
 

From 0e6845f7cf6597083dbe3bb45c2b0a3b4b2c75ac Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Wed, 3 Jul 2024 02:02:39 +0000
Subject: [PATCH 181/181] Cleanup

---
 vllm/worker/cpu_model_runner.py       | 4 +---
 vllm/worker/embedding_model_runner.py | 5 ++---
 vllm/worker/neuron_model_runner.py    | 4 +---
 vllm/worker/tpu_model_runner.py       | 3 +--
 vllm/worker/xpu_model_runner.py       | 4 +---
 5 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index 982782355899..d8397ac22a58 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -365,10 +365,8 @@ def execute_model(
             "positions": model_input.input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": model_input.attn_metadata,
+            **(model_input.multi_modal_kwargs or {}),
         }
-        if (self.vision_language_config
-                and model_input.multi_modal_kwargs is not None):
-            execute_model_kwargs.update(model_input.multi_modal_kwargs)
 
         hidden_states = model_executable(**execute_model_kwargs)
 
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index 0e1bb1bfe273..d3a2643cb62f 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -92,10 +92,9 @@ def execute_model(
             "positions": model_input.input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": model_input.attn_metadata,
+            **(model_input.multi_modal_kwargs or {}),
         }
-        if self.vision_language_config:
-            multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-            execute_model_kwargs.update({"image_input": multi_modal_kwargs})
+
         hidden_states = model_executable(**execute_model_kwargs)
 
         # Only perform pooling in the driver worker.
diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py
index a95468110184..423f44085e31 100644
--- a/vllm/worker/neuron_model_runner.py
+++ b/vllm/worker/neuron_model_runner.py
@@ -239,13 +239,11 @@ def execute_model(
             raise ValueError(
                 "NeuronModelRunner does not support multi-step execution.")
 
-        multi_modal_kwargs = model_input.multi_modal_kwargs or {}
-
         hidden_states = self.model(
             input_ids=model_input.input_tokens,
             positions=model_input.input_positions,
             input_block_ids=model_input.input_block_ids,
-            **multi_modal_kwargs,
+            **(model_input.multi_modal_kwargs or {}),
         )
 
         # Compute the logits.
diff --git a/vllm/worker/tpu_model_runner.py b/vllm/worker/tpu_model_runner.py
index 29928e3a8da0..4ea8e62cc1fd 100644
--- a/vllm/worker/tpu_model_runner.py
+++ b/vllm/worker/tpu_model_runner.py
@@ -568,13 +568,12 @@ def forward(
             slot_mapping = slot_mapping.flatten()
             attn_metadata.slot_mapping = slot_mapping
 
-        multi_modal_kwargs = multi_modal_kwargs or {}
         hidden_states = self.model(
             token_ids,
             position_ids,
             kv_caches,
             attn_metadata,
-            **multi_modal_kwargs,
+            **(multi_modal_kwargs or {}),
         )
         hidden_states = hidden_states.flatten(0, 1)
         logits = self.model.compute_logits(hidden_states, sampling_metadata)
diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py
index 568fa8791588..f4fc42328027 100644
--- a/vllm/worker/xpu_model_runner.py
+++ b/vllm/worker/xpu_model_runner.py
@@ -374,10 +374,8 @@ def execute_model(
             "positions": model_input.input_positions,
             "kv_caches": kv_caches,
             "attn_metadata": model_input.attn_metadata,
+            **(model_input.multi_modal_kwargs or {}),
         }
-        if (self.vision_language_config
-                and model_input.multi_modal_kwargs is not None):
-            execute_model_kwargs.update(model_input.multi_modal_kwargs)
 
         hidden_states = model_executable(**execute_model_kwargs)