From 4f7b21091d50e320a6481d8329bb44239829492d Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Tue, 25 Jun 2024 16:44:34 -0700 Subject: [PATCH 01/21] [VLM] Remove support for pixel_values and image_features. Signed-off-by: Xiaowei Jiang --- .buildkite/download-images.sh | 4 - docs/source/models/vlm.rst | 6 +- examples/llava_example.py | 55 ++------- examples/phi3v_example.py | 7 +- tests/conftest.py | 16 +-- tests/entrypoints/test_openai_vision.py | 2 - tests/models/test_llava.py | 19 ++- tests/models/test_llava_next.py | 20 ++-- tests/models/test_phi3v.py | 18 ++- tests/multimodal/test_processor.py | 40 +++---- tests/tokenization/test_image_processor.py | 20 ---- vllm/config.py | 36 +----- vllm/engine/arg_utils.py | 57 +-------- vllm/entrypoints/openai/api_server.py | 9 -- vllm/entrypoints/openai/serving_chat.py | 11 +- vllm/inputs.py | 6 +- vllm/model_executor/model_loader/loader.py | 2 +- vllm/model_executor/models/llava.py | 77 +++---------- vllm/model_executor/models/llava_next.py | 85 ++++---------- vllm/model_executor/models/phi3v.py | 16 +-- vllm/multimodal/image.py | 127 +++++---------------- vllm/multimodal/registry.py | 62 +++++----- vllm/multimodal/utils.py | 6 +- vllm/sequence.py | 8 +- vllm/transformers_utils/image_processor.py | 3 - vllm/worker/cpu_model_runner.py | 2 +- vllm/worker/model_runner.py | 2 +- 27 files changed, 194 insertions(+), 522 deletions(-) delete mode 100644 tests/tokenization/test_image_processor.py diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh index 389a12956c3c..360a7584bccf 100644 --- a/.buildkite/download-images.sh +++ b/.buildkite/download-images.sh @@ -8,10 +8,6 @@ set -o pipefail # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/ mkdir -p images cd images -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt -wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 1837dd2aa89f..169265078c7f 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -36,7 +36,6 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` llm = LLM( model="llava-hf/llava-1.5-7b-hf", - image_input_type="pixel_values", image_token_id=32000, image_input_shape="1,3,336,336", image_feature_size=576, @@ -49,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. -* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`. +* ``multi_modal_data``: This is a loosely structured dict that contains multi modal data. .. 
code-block:: python @@ -61,7 +60,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS outputs = llm.generate({ "prompt": prompt, - "multi_modal_data": ImagePixelData(image), + "multi_modal_data": {"image": image}, }) for o in outputs: @@ -93,7 +92,6 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with python -m vllm.entrypoints.openai.api_server \ --model llava-hf/llava-1.5-7b-hf \ - --image-input-type pixel_values \ --image-token-id 32000 \ --image-input-shape 1,3,336,336 \ --image-feature-size 576 \ diff --git a/examples/llava_example.py b/examples/llava_example.py index 980d7bf9f8a3..c4ddab299fa5 100644 --- a/examples/llava_example.py +++ b/examples/llava_example.py @@ -2,37 +2,32 @@ import os import subprocess -import torch from PIL import Image from vllm import LLM -from vllm.multimodal.image import ImageFeatureData, ImagePixelData # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`. # You can use `.buildkite/download-images.sh` to download them -def run_llava_pixel_values(*, disable_image_processor: bool = False): +def run_llava(): llm = LLM( model="llava-hf/llava-1.5-7b-hf", - image_input_type="pixel_values", image_token_id=32000, image_input_shape="1,3,336,336", image_feature_size=576, - disable_image_processor=disable_image_processor, ) prompt = "" * 576 + ( "\nUSER: What is the content of this image?\nASSISTANT:") - if disable_image_processor: - image = torch.load("images/stop_sign_pixel_values.pt") - else: - image = Image.open("images/stop_sign.jpg") + image = Image.open("images/stop_sign.jpg") outputs = llm.generate({ "prompt": prompt, - "multi_modal_data": ImagePixelData(image), + "multi_modal_data": { + "image": image + }, }) for o in outputs: @@ -40,45 +35,11 @@ def run_llava_pixel_values(*, disable_image_processor: bool = False): print(generated_text) -def run_llava_image_features(): - llm = LLM( - model="llava-hf/llava-1.5-7b-hf", - image_input_type="image_features", - image_token_id=32000, - image_input_shape="1,576,1024", - image_feature_size=576, - ) - - prompt = "" * 576 + ( - "\nUSER: What is the content of this image?\nASSISTANT:") - - image: torch.Tensor = torch.load("images/stop_sign_image_features.pt") - - outputs = llm.generate({ - "prompt": prompt, - "multi_modal_data": ImageFeatureData(image), - }) - - for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) - - -def main(args): - if args.type == "pixel_values": - run_llava_pixel_values() - else: - run_llava_image_features() +def main(): + run_llava() if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Demo on Llava") - parser.add_argument("--type", - type=str, - choices=["pixel_values", "image_features"], - default="pixel_values", - help="image input type") - args = parser.parse_args() # Download from s3 s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/" local_directory = "images" @@ -95,4 +56,4 @@ def main(args): local_directory, "--no-sign-request", ]) - main(args) + main() diff --git a/examples/phi3v_example.py b/examples/phi3v_example.py index 4f37c47ddca8..7d93a6404837 100644 --- a/examples/phi3v_example.py +++ b/examples/phi3v_example.py @@ -4,7 +4,6 @@ from PIL import Image from vllm import LLM, SamplingParams -from vllm.multimodal.image import ImagePixelData def run_phi3v(): @@ -12,11 +11,9 @@ def run_phi3v(): llm = LLM( model=model_path, trust_remote_code=True, - image_input_type="pixel_values", image_token_id=32044, image_input_shape="1,3,1008,1344", 
image_feature_size=1921, - disable_image_processor=False, ) image = Image.open("images/cherry_blossom.jpg") @@ -30,7 +27,9 @@ def run_phi3v(): outputs = llm.generate( { "prompt": prompt, - "multi_modal_data": ImagePixelData(image), + "multi_modal_data": { + "image": image + }, }, sampling_params=sampling_params) for o in outputs: diff --git a/tests/conftest.py b/tests/conftest.py index 9d00c7676694..c3a4c63fd7ce 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,13 +17,13 @@ AutoProcessor, AutoTokenizer, BatchEncoding) from vllm import LLM, SamplingParams -from vllm.config import TokenizerPoolConfig, VisionLanguageConfig +from vllm.config import TokenizerPoolConfig from vllm.distributed import (destroy_distributed_environment, destroy_model_parallel) from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.multimodal import MultiModalData -from vllm.multimodal.image import ImageFeatureData, ImagePixelData +from vllm.multimodal.image import ImageData from vllm.sequence import SampleLogprobs from vllm.utils import cuda_device_count_stateless, is_cpu @@ -62,16 +62,8 @@ def pil_image(self) -> Image.Image: def for_hf(self) -> Image.Image: return self.pil_image - def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData: - image_input_type = vision_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType - - if image_input_type == ImageInputType.IMAGE_FEATURES: - return ImageFeatureData(self.image_features) - if image_input_type == ImageInputType.PIXEL_VALUES: - return ImagePixelData(self.pil_image) - - raise NotImplementedError + def for_vllm(self) -> Dict[str, Any]: + return {"image": self.pil_image} class _ImageAssetPrompts(TypedDict): diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py index 0e8d88b76ffe..c59381dfb0b9 100644 --- a/tests/entrypoints/test_openai_vision.py +++ b/tests/entrypoints/test_openai_vision.py @@ -42,8 +42,6 @@ def server(): "--max-model-len", "4096", "--enforce-eager", - "--image-input-type", - "pixel_values", "--image-token-id", "32000", "--image-input-shape", diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index ac1d2ece62b2..e1dd57718690 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -24,17 +24,12 @@ def iter_llava_configs(model_name: str): } for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)), - ]: - yield (model_name, - VisionLanguageConfig(image_input_type=input_type, - image_feature_size=f, - image_token_id=32000, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None)) + input_shape = (1, 3, h, w) + yield (model_name, + VisionLanguageConfig(image_input_type=None, + image_feature_size=f, + image_token_id=32000, + image_input_shape=input_shape)) model_and_vl_config = [ @@ -82,7 +77,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config, """ model_id, vlm_config = model_and_config hf_images = [asset.for_hf() for asset in image_assets] - vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] + vllm_images = [asset.for_vllm() for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index 
d36e503871ca..efab0a241044 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -32,16 +32,14 @@ def iter_llava_next_configs(model_name: str): } for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - ]: - yield (model_name, - VisionLanguageConfig(image_input_type=input_type, - image_feature_size=f, - image_token_id=32000, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None)) + input_shape = (1, 3, h, w) + yield (model_name, + VisionLanguageConfig( + image_input_type=None, + image_feature_size=f, + image_token_id=32000, + image_input_shape=input_shape, + )) model_and_vl_config = [ @@ -91,7 +89,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config, """ model_id, vlm_config = model_and_config hf_images = [asset.for_hf() for asset in image_assets] - vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] + vllm_images = [asset.for_vllm() for asset in image_assets] with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model: hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 03c130466836..39275ee843e5 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -26,16 +26,12 @@ def iter_phi3v_configs(model_name: str): } for (h, w), f in image_hw_to_feature_size.items(): - for input_type, input_shape in [ - (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)), - ]: - yield (model_name, - VisionLanguageConfig(image_input_type=input_type, - image_feature_size=f, - image_token_id=32044, - image_input_shape=input_shape, - image_processor=model_name, - image_processor_revision=None)) + input_shape = (1, 3, h, w) + yield (model_name, + VisionLanguageConfig(image_input_type=None, + image_feature_size=f, + image_token_id=32044, + image_input_shape=input_shape)) model_and_vl_config = [ @@ -95,7 +91,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config, """ model_id, vlm_config = model_and_config hf_images = [asset.for_hf() for asset in image_assets] - vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets] + vllm_images = [asset.for_vllm() for asset in image_assets] # use eager mode for hf runner, since phi3_v didn't work with flash_attn hf_model_kwargs = {"_attn_implementation": "eager"} diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py index 9ac48dfab678..52231c1b7b70 100644 --- a/tests/multimodal/test_processor.py +++ b/tests/multimodal/test_processor.py @@ -4,7 +4,7 @@ from vllm.config import ModelConfig, VisionLanguageConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import ImagePixelData +from vllm.multimodal.image import ImageData from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE @@ -27,12 +27,10 @@ def test_clip_image_processor(image_assets, dtype): revision=None, ) vlm_config = VisionLanguageConfig( - image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES, + image_input_type=None, image_token_id=32000, image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), image_feature_size=576, - image_processor=MODEL_NAME, - image_processor_revision=None, ) for asset in image_assets: @@ -41,7 +39,7 @@ def test_clip_image_processor(image_assets, dtype): return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.process_input( - 
ImagePixelData(asset.pil_image), + ImageData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -75,14 +73,11 @@ def test_llava_next_image_processor(image_assets, dtype): dtype=dtype, revision=None, ) - vlm_config = VisionLanguageConfig( - image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES, - image_token_id=64000, - image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), - image_feature_size=2928, - image_processor=MODEL_NAME, - image_processor_revision=None, - ) + vlm_config = VisionLanguageConfig(image_input_type=None, + image_token_id=64000, + image_input_shape=(1, 3, IMAGE_HEIGHT, + IMAGE_WIDTH), + image_feature_size=2928) for asset in image_assets: hf_result = hf_processor.preprocess( @@ -90,7 +85,7 @@ def test_llava_next_image_processor(image_assets, dtype): return_tensors="pt", ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(asset.pil_image), + ImageData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) @@ -120,23 +115,20 @@ def test_image_pixel_types(image_assets, dtype): dtype=dtype, revision=None, ) - vlm_config = VisionLanguageConfig( - image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES, - image_token_id=32000, - image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), - image_feature_size=576, - image_processor=MODEL_NAME, - image_processor_revision=None, - ) + vlm_config = VisionLanguageConfig(image_input_type=None, + image_token_id=32000, + image_input_shape=(1, 3, IMAGE_HEIGHT, + IMAGE_WIDTH), + image_feature_size=576) for asset in image_assets: image_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(asset.pil_image), + ImageData(asset.pil_image), model_config=model_config, vlm_config=vlm_config, ) tensor_result = MULTIMODAL_REGISTRY.process_input( - ImagePixelData(asset.pixel_values), + ImageData(asset.pixel_values), model_config=model_config, vlm_config=vlm_config, ) diff --git a/tests/tokenization/test_image_processor.py b/tests/tokenization/test_image_processor.py deleted file mode 100644 index 5ba232336741..000000000000 --- a/tests/tokenization/test_image_processor.py +++ /dev/null @@ -1,20 +0,0 @@ -import pytest -from transformers.image_processing_utils import BaseImageProcessor - -from vllm.transformers_utils.image_processor import get_image_processor - -IMAGE_PROCESSOR_NAMES = [ - "llava-hf/llava-1.5-7b-hf", - "llava-hf/llava-v1.6-34b-hf", -] - - -@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES) -def test_image_processor_revision(processor_name: str): - # Assume that "main" branch always exists - image_processor = get_image_processor(processor_name, revision="main") - assert isinstance(image_processor, BaseImageProcessor) - - # Assume that "never" branch always does not exist - with pytest.raises(OSError, match='not a valid git identifier'): - get_image_processor(processor_name, revision="never") diff --git a/vllm/config.py b/vllm/config.py index 0c4d770e4684..368a41c3f329 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1154,28 +1154,13 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): raise ValueError("LoRA is not supported with chunked prefill yet.") +# TODO: To be replaced by MultiModalConfig. @dataclass class VisionLanguageConfig: """Configs the input data format and how models should run for vision language models.""" - class ImageInputType(enum.Enum): - """Image input type into the vision language model. 
- - An image roughly goes through the following transformation: - Raw image --> pixel values --> image features --> image embeddings. - - The difference between different image input types is where the - image encoder (pixel values --> image features) is run. - Different image input types also correspond to different tensor shapes. - - For example, for Llava, PIXEL_VALUES: (1, 3, 336, 336). - IMAGE_FEATURES: (1, 576, 1024). - """ - PIXEL_VALUES = enum.auto() - IMAGE_FEATURES = enum.auto() - - image_input_type: ImageInputType + image_input_type: None # The input id corresponding to image token. image_token_id: int # Used for running `run_prefill_max_token`. @@ -1183,19 +1168,6 @@ class ImageInputType(enum.Enum): # worst case scenario (biggest supported resolution). image_input_shape: tuple image_feature_size: int - # The image processor to load from HuggingFace - image_processor: Optional[str] - image_processor_revision: Optional[str] - - @classmethod - def get_image_input_enum_type(cls, value: str) -> ImageInputType: - """Get the image input type from a string.""" - try: - return cls.ImageInputType[value.upper()] - except KeyError as e: - raise ValueError(f"{value} is not a valid choice. " - f"Expecting to choose from " - f"{[x.name for x in cls.ImageInputType]}.") from e #TODO(ywang96): make this a cached property once we refactor the # VisionLanguageConfig class. @@ -1214,6 +1186,8 @@ def as_cli_args_dict(self) -> Dict[str, Any]: """ result: Dict[str, Any] = {} for f in fields(self): + if f.name == "image_input_type": + continue value = getattr(self, f.name) if isinstance(value, enum.Enum): result[f.name] = value.name.lower() @@ -1222,8 +1196,6 @@ def as_cli_args_dict(self) -> Dict[str, Any]: else: result[f.name] = value - result["disable_image_processor"] = self.image_processor is None - return result diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 16374098b23d..afbf0b33d4c8 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -1,7 +1,6 @@ import argparse import dataclasses import json -import warnings from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -80,13 +79,9 @@ class EngineArgs: preemption_mode: Optional[str] = None # Related to Vision-language models such as llava - image_input_type: Optional[str] = None image_token_id: Optional[int] = None image_input_shape: Optional[str] = None image_feature_size: Optional[int] = None - image_processor: Optional[str] = None - image_processor_revision: Optional[str] = None - disable_image_processor: bool = False scheduler_delay_factor: float = 0.0 enable_chunked_prefill: bool = False @@ -112,14 +107,6 @@ def __post_init__(self): @staticmethod def add_cli_args_for_vlm( parser: FlexibleArgumentParser) -> FlexibleArgumentParser: - parser.add_argument('--image-input-type', - type=nullable_str, - default=None, - choices=[ - t.name.lower() - for t in VisionLanguageConfig.ImageInputType - ], - help=('The image input type passed into vLLM.')) parser.add_argument('--image-token-id', type=int, default=None, @@ -135,24 +122,6 @@ def add_cli_args_for_vlm( type=int, default=None, help=('The image feature size along the context dimension.')) - parser.add_argument( - '--image-processor', - type=str, - default=EngineArgs.image_processor, - help='Name or path of the huggingface image processor to use. 
' - 'If unspecified, model name or path will be used.') - parser.add_argument( - '--image-processor-revision', - type=str, - default=None, - help='Revision of the huggingface image processor version to use. ' - 'It can be a branch name, a tag name, or a commit id. ' - 'If unspecified, will use the default version.') - parser.add_argument( - '--disable-image-processor', - action='store_true', - help='Disables the use of image processor, even if one is defined ' - 'for the model on huggingface.') return parser @@ -742,33 +711,17 @@ def create_engine_config(self, ) -> EngineConfig: model_loader_extra_config=self.model_loader_extra_config, ) - if self.image_input_type: - if (not self.image_token_id or not self.image_input_shape - or not self.image_feature_size): + if self.image_token_id: + if (not self.image_input_shape or not self.image_feature_size): raise ValueError( - 'Specify `image_token_id`, `image_input_shape` and ' - '`image_feature_size` together with `image_input_type`.') - - if self.image_processor is None: - self.image_processor = self.model - if self.disable_image_processor: - if self.image_processor != self.model: - warnings.warn( - "You've specified an image processor " - f"({self.image_processor}) but also disabled " - "it via `--disable-image-processor`.", - stacklevel=2) - - self.image_processor = None + 'Specify `image_input_shape` and ' + '`image_feature_size` together with `image_token_id`.') vision_language_config = VisionLanguageConfig( - image_input_type=VisionLanguageConfig. - get_image_input_enum_type(self.image_input_type), + image_input_type=None, image_token_id=self.image_token_id, image_input_shape=str_to_int_tuple(self.image_input_shape), image_feature_size=self.image_feature_size, - image_processor=self.image_processor, - image_processor_revision=self.image_processor_revision, ) else: vision_language_config = None diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index ea6275920c79..b56d656bf610 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -184,15 +184,6 @@ async def authentication(request: Request, call_next): engine_args = AsyncEngineArgs.from_cli_args(args) - # Enforce pixel values as image input type for vision language models - # when serving with API server - if engine_args.image_input_type is not None and \ - engine_args.image_input_type.upper() != "PIXEL_VALUES": - raise ValueError( - f"Invalid image_input_type: {engine_args.image_input_type}. 
" - "Only --image-input-type 'pixel_values' is supported for serving " - "vision language models with the vLLM API server.") - engine = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 744e1d94511b..bd0d82545ca5 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -26,7 +26,7 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) -from vllm.multimodal.image import ImagePixelData +from vllm.multimodal.image import ImageData from vllm.multimodal.utils import (async_get_and_parse_image, get_full_image_text_prompt) from vllm.outputs import RequestOutput @@ -47,8 +47,7 @@ class ConversationMessage(TypedDict): @dataclass(frozen=True) class ChatMessageParseResult: messages: List[ConversationMessage] - image_futures: List[Awaitable[ImagePixelData]] = field( - default_factory=list) + image_futures: List[Awaitable[ImageData]] = field(default_factory=list) class OpenAIServingChat(OpenAIServing): @@ -103,7 +102,7 @@ def _parse_chat_message_content_parts( parts: Iterable[ChatCompletionContentPartParam], ) -> ChatMessageParseResult: texts: List[str] = [] - image_futures: List[Awaitable[ImagePixelData]] = [] + image_futures: List[Awaitable[ImageData]] = [] vlm_config: Optional[VisionLanguageConfig] = getattr( self.engine.engine, "vision_language_config", None) @@ -210,7 +209,7 @@ async def create_chat_completion( try: conversation: List[ConversationMessage] = [] - image_futures: List[Awaitable[ImagePixelData]] = [] + image_futures: List[Awaitable[ImageData]] = [] for msg in request.messages: chat_parsed_result = self._parse_chat_message_content(msg) @@ -228,7 +227,7 @@ async def create_chat_completion( return self.create_error_response(str(e)) # Fetch image data - image_data: Optional[ImagePixelData] = None + image_data: Optional[ImageData] = None try: if len(image_futures): # since we support only single image currently diff --git a/vllm/inputs.py b/vllm/inputs.py index 026903e19a26..518a342ada2c 100644 --- a/vllm/inputs.py +++ b/vllm/inputs.py @@ -1,5 +1,5 @@ -from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, - TypedDict, Union, cast, overload) +from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, + Sequence, TypedDict, Union, cast, overload) from typing_extensions import NotRequired @@ -127,4 +127,4 @@ class TextTokensPrompt(TypedDict): class LLMInputs(TypedDict): prompt_token_ids: List[int] prompt: NotRequired[Optional[str]] - multi_modal_data: NotRequired[Optional["MultiModalData"]] + multi_modal_data: NotRequired[Optional[Dict[str, Any]]] diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index d3babcf9c345..9acc38a3dcd4 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -79,7 +79,7 @@ def _get_model_initialization_kwargs( "please open an issue on github.") elif issubclass(model_class, VisionLanguageModelBase): if vision_language_config is None: - raise ValueError("Provide `image_input_type` and other vision " + raise ValueError("Provide vision " "related configurations through LLM entrypoint " "or engine arguments.") diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 8e36c54b1c51..3679916f2752 100644 --- a/vllm/model_executor/models/llava.py +++ 
b/vllm/model_executor/models/llava.py @@ -1,4 +1,4 @@ -from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union +from typing import Iterable, List, Literal, Optional, Tuple, TypedDict import torch import torch.nn as nn @@ -74,17 +74,10 @@ class LlavaImagePixelInputs(TypedDict): """Shape: (batch_size, num_channels, height, width)""" -class LlavaImageFeatureInputs(TypedDict): - type: Literal["image_features"] - data: torch.Tensor - """Shape: (batch_size, image_feature_size, hidden_size)""" - +LlavaImageInputs = LlavaImagePixelInputs -LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs] - -@MULTIMODAL_REGISTRY.register_image_feature_input() -@MULTIMODAL_REGISTRY.register_image_pixel_input() +@MULTIMODAL_REGISTRY.register_image_input() @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) class LlavaForConditionalGeneration(VisionLanguageModelBase): @@ -97,8 +90,8 @@ def __init__(self, self.config = config - if self.vision_language_config.image_input_type == ( - VisionLanguageConfig.ImageInputType.PIXEL_VALUES): + # TODO: To be replaced by `multi_modal_config`. + if self.vision_language_config: self.vision_tower = CLIPVisionModel(config.vision_config) else: self.vision_tower = None @@ -137,44 +130,17 @@ def _validate_image_data(self, data: torch.Tensor) -> torch.Tensor: def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaImageInputs]: pixel_values = kwargs.pop("pixel_values", None) - image_features = kwargs.pop("image_features", None) - - expected_input_type = self.vision_language_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType - - if expected_input_type == ImageInputType.PIXEL_VALUES: - if image_features is not None: - raise ValueError( - "Expected pixel values but got image features") - if pixel_values is None: - return None - - if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - return LlavaImagePixelInputs( - type="pixel_values", - data=self._validate_image_data(pixel_values), - ) + if pixel_values is None: + return None - if expected_input_type == ImageInputType.IMAGE_FEATURES: - if pixel_values is not None: - raise ValueError( - "Expected image features but got pixel values") - if image_features is None: - return None + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") - if not isinstance(image_features, torch.Tensor): - raise ValueError("Incorrect type of image features. 
" - f"Got type: {type(image_features)}") - - return LlavaImageFeatureInputs( - type="image_features", - data=self._validate_image_data(image_features), - ) - - return None + return LlavaImagePixelInputs( + type="pixel_values", + data=self._validate_image_data(pixel_values), + ) def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: @@ -209,12 +175,8 @@ def _process_image_pixels(self, def _process_image_input(self, image_input: LlavaImageInputs) -> torch.Tensor: - if image_input["type"] == "pixel_values": - assert self.vision_tower is not None - image_features = self._process_image_pixels(image_input) - else: - image_features = image_input["data"] - + assert self.vision_tower is not None + image_features = self._process_image_pixels(image_input) return self.multi_modal_projector(image_features) def forward( @@ -245,19 +207,12 @@ def forward( This way, the `positions` and `attn_metadata` are consistent with the `input_ids`. - This model has two modes of image inputs: - `PIXEL_VALUES` and `IMAGE_FEATURES`. - Args: input_ids: Flattened (concatenated) input_ids corresponding to a batch. pixel_values: The pixels in each input image. Expects a batch with shape `[1, 3, 336, 336]`. (Only applicable to `PIXEL_VALUES` mode) - image_features: The image features for each input image outputted by - the vision tower before passing to the multi-modal projector. - Expects a batch with shape `[1, 576, 1024]`. - (Only applicable to `IMAGE_FEATURES` mode) See also: Each input maps to huggingface implementation, as follows: diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c1158c933c88..9617518d4e31 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -1,5 +1,4 @@ -from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict, - Union) +from typing import Dict, Iterable, List, Literal, Optional, Tuple, TypedDict import torch import torch.nn as nn @@ -22,7 +21,7 @@ from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData -from vllm.multimodal.image import ImagePixelData, get_dummy_image_data +from vllm.multimodal.image import ImageData, get_dummy_image_data from vllm.sequence import SamplerOutput, SequenceData from .llava import LlavaMultiModalProjector, merge_vision_embeddings @@ -45,17 +44,7 @@ class LlavaNextImagePixelInputs(TypedDict): """Shape: (batch_size, 2)""" -class LlavaNextImageFeatureInputs(TypedDict): - type: Literal["image_features"] - data: torch.Tensor - """Shape: (batch_size, 1 + num_patches, image_feature_size, hidden_size)""" - - image_sizes: NotRequired[torch.Tensor] - """Shape: (batch_size, 2)""" - - -LlavaNextImageInputs = Union[LlavaNextImagePixelInputs, - LlavaNextImageFeatureInputs] +LlavaNextImageInputs = LlavaNextImagePixelInputs def _get_dummy_image_data( @@ -66,19 +55,15 @@ def _get_dummy_image_data( seq_data, fake_mm_data = get_dummy_image_data(seq_len, model_config, vlm_config) - config_input_type = vlm_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType - - if config_input_type == ImageInputType.PIXEL_VALUES: - _, c, h, w = vlm_config.image_input_shape - mode = {1: "L", 3: "RGB"}[c] - fake_mm_data = ImagePixelData(Image.new(mode, (w, h), color=0)) + _, c, h, w = vlm_config.image_input_shape + mode = {1: "L", 3: "RGB"}[c] + fake_mm_data = ImageData(Image.new(mode, 
(w, h), color=0)) return seq_data, fake_mm_data def _image_pixel_processor( - data: ImagePixelData, + data: ImageData, model_config: ModelConfig, vlm_config: VisionLanguageConfig, ) -> Dict[str, torch.Tensor]: @@ -100,11 +85,11 @@ def _image_pixel_processor( data.image = image.resize((w, h)) - return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \ + return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \ ._default_input_processor(data, model_config, vlm_config) -@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor) +@MULTIMODAL_REGISTRY.register_image_input(_image_pixel_processor) @MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data) class LlavaNextForConditionalGeneration(VisionLanguageModelBase): @@ -118,11 +103,7 @@ def __init__(self, # Update the type annotation from that of its superclass self.config = config - if self.vision_language_config.image_input_type == ( - VisionLanguageConfig.ImageInputType.PIXEL_VALUES): - self.vision_tower = CLIPVisionModel(config=config.vision_config) - else: - raise TypeError("Image features are not supported by LLaVA-NeXT") + self.vision_tower = CLIPVisionModel(config=config.vision_config) self.multi_modal_projector = LlavaMultiModalProjector( vision_hidden_size=config.vision_config.hidden_size, @@ -175,36 +156,23 @@ def _parse_and_validate_image_input( self, **kwargs: object) -> Optional[LlavaNextImageInputs]: pixel_values = kwargs.pop("pixel_values", None) image_sizes = kwargs.pop("image_sizes", None) - image_features = kwargs.pop("image_features", None) - expected_input_type = self.vision_language_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType + if pixel_values is None: + return None - if expected_input_type == ImageInputType.PIXEL_VALUES: - if image_features is not None: - raise ValueError( - "Expected pixel values but got image features") - if pixel_values is None: - return None - - if not isinstance(pixel_values, torch.Tensor): - raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") - - if not isinstance(image_sizes, torch.Tensor): - raise ValueError("Incorrect type of image sizes. " - f"Got type: {type(image_sizes)}") - - return LlavaNextImagePixelInputs( - type="pixel_values", - data=self._validate_image_pixels(pixel_values), - image_sizes=self._validate_image_sizes(image_sizes), - ) + if not isinstance(pixel_values, torch.Tensor): + raise ValueError("Incorrect type of pixel values. " + f"Got type: {type(pixel_values)}") - assert expected_input_type != ImageInputType.IMAGE_FEATURES, ( - "Failed to validate this at initialization time") + if not isinstance(image_sizes, torch.Tensor): + raise ValueError("Incorrect type of image sizes. 
" + f"Got type: {type(image_sizes)}") - return None + return LlavaNextImagePixelInputs( + type="pixel_values", + data=self._validate_image_pixels(pixel_values), + image_sizes=self._validate_image_sizes(image_sizes), + ) def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: @@ -311,11 +279,8 @@ def _process_image_pixels( def _process_image_input( self, image_input: LlavaNextImageInputs) -> torch.Tensor: - if image_input["type"] == "pixel_values": - assert self.vision_tower is not None - image_features = self._process_image_pixels(image_input) - else: - image_features = image_input["data"] + assert self.vision_tower is not None + image_features = self._process_image_pixels(image_input) patch_embeddings = self.multi_modal_projector(image_features) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index dac832a686c2..f3fa4dafc4f6 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -35,7 +35,7 @@ from vllm.model_executor.models.vlm_base import VisionLanguageModelBase from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import ImagePixelData, get_dummy_image_data +from vllm.multimodal.image import ImageData, get_dummy_image_data from vllm.sequence import SamplerOutput logger = init_logger(__name__) @@ -309,7 +309,7 @@ def calc_hd_transform_size(width, height, hd_num=16): def _image_processor( - data: ImagePixelData, + data: ImageData, model_config: ModelConfig, vlm_config: VisionLanguageConfig, ) -> Dict[str, torch.Tensor]: @@ -325,11 +325,11 @@ def _image_processor( data.image = image.resize((w, h)) - return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \ + return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \ ._default_input_processor(data, model_config, vlm_config) -@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_processor) +@MULTIMODAL_REGISTRY.register_image_input(_image_processor) @MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data) class Phi3VForCausalLM(VisionLanguageModelBase): @@ -352,14 +352,6 @@ def _parse_and_validate_image_input( pixel_values = kwargs.pop("pixel_values", None) image_sizes = kwargs.pop("image_sizes", None) - expected_input_type = self.vision_language_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType - - if expected_input_type != ImageInputType.PIXEL_VALUES: - raise ValueError( - f"Unexpected image input type: {expected_input_type}." - "Phi3v only support pixel_values input currently.") - if pixel_values is not None and image_sizes is not None: return Phi3VImagePixelInputs(type="pixel_values", data=pixel_values, diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index 08fb09d11160..fe8b3d5ef9a9 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,4 +1,4 @@ -from typing import Dict, Tuple, Type, Union +from typing import Dict, Tuple, Type import torch from PIL import Image @@ -12,26 +12,24 @@ logger = init_logger(__name__) +IMAGE_TOKEN_ID = 32000 +IMAGE_FEATURE_SIZE = 576 +IMAGE_SHAPE = (336, 336) + +# TODO: All the reference to `vlm_config` will be updated to `mm_config`. +# TODO: This file should also be scoped to mm. 
def _get_dummy_seq_data(seq_len: int, vlm_config: VisionLanguageConfig) -> SequenceData: - # NOTE: We assume that token is repeated `image_feature_size` times - # and then concatenated with the text prompt - # TODO: Enable other ways of inserting the image into the prompt - - token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size - token_ids += [0] * (seq_len - vlm_config.image_feature_size) - + assert seq_len >= IMAGE_FEATURE_SIZE, ( + f"`seq_len` should be at least {IMAGE_FEATURE_SIZE}.") + token_ids = [IMAGE_TOKEN_ID] * IMAGE_FEATURE_SIZE + token_ids += [0] * (seq_len - IMAGE_FEATURE_SIZE) return SequenceData(token_ids) -def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor: - if vlm_config.image_processor is None: - values_dtype = torch.float16 - else: - values_dtype = torch.uint8 - - return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype) +def _get_dummy_image(vlm_config: VisionLanguageConfig) -> Image.Image: + return Image.new("RGB", IMAGE_SHAPE, color=(255, 255, 255)) def get_dummy_image_data( @@ -42,72 +40,41 @@ def get_dummy_image_data( """Standard dummy data factory for image data (to be used in :meth:`vlm.multimodal.MultiModalRegistry.register_dummy_data`).""" seq_data = _get_dummy_seq_data(seq_len, vlm_config) - values = _get_dummy_values(vlm_config) + image = _get_dummy_image(vlm_config) - config_input_type = vlm_config.image_input_type - ImageInputType = VisionLanguageConfig.ImageInputType + return seq_data, ImageData(image) - fake_mm_data: MultiModalData - if config_input_type == ImageInputType.PIXEL_VALUES: - fake_mm_data = ImagePixelData(values) - elif config_input_type == ImageInputType.IMAGE_FEATURES: - fake_mm_data = ImageFeatureData(values) - else: - raise NotImplementedError - - return seq_data, fake_mm_data - - -class ImagePixelData(MultiModalData): - """ - The pixel data of an image. Can be one of: - - :class:``PIL.Image``: An image object. Requires that a HuggingFace - processor is available to the model. - - :class:``torch.Tensor``: The raw pixel data which is passed to the model - without additional pre-processing. +class ImageData(MultiModalData): + """An :class:``PIL.Image`` image. Requires that a HuggingFace + processor is available to the model. 
""" - def __init__(self, image: Union[Image.Image, torch.Tensor]) -> None: - if isinstance(image, Image.Image): - # So that this class can be created inside the Image context manager - image.load() - + def __init__(self, image: Image.Image) -> None: + # So that this class can be created inside the Image context manager + image.load() self.image = image def __repr__(self) -> str: - image = self.image - if isinstance(image, Image.Image): - return f"{type(self).__name__}(image={image})" - - return (f"{type(self).__name__}(image=torch.Tensor(shape=" - f"{image.shape}, dtype={image.dtype}))") + return f"{type(self).__name__}(image={self.image})" -class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]): +class ImagePlugin(MultiModalPlugin[ImageData]): - def get_data_type(self) -> Type[ImagePixelData]: - return ImagePixelData - - def _get_hf_image_processor(self, model_config: ModelConfig, - vlm_config: VisionLanguageConfig): - if vlm_config is None or vlm_config.image_processor is None: - return None + def get_data_type(self) -> Type[ImageData]: + return ImageData + def _get_hf_image_processor(self, model_config: ModelConfig): return cached_get_image_processor( - vlm_config.image_processor, - trust_remote_code=model_config.trust_remote_code, - revision=vlm_config.image_processor_revision, - ) + model_config.model, + trust_remote_code=model_config.trust_remote_code) def _default_input_processor( - self, data: ImagePixelData, model_config: ModelConfig, + self, data: ImageData, model_config: ModelConfig, vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]: image = data.image - if isinstance(image, Image.Image): - image_processor = self._get_hf_image_processor( - model_config, vlm_config) + image_processor = self._get_hf_image_processor(model_config) if image_processor is None: raise RuntimeError("No HuggingFace processor is available" "to process the image object") @@ -117,39 +84,5 @@ def _default_input_processor( except Exception: logger.error("Failed to process image (%s)", image) raise - elif isinstance(image, torch.Tensor): - pixel_values = image.to(model_config.dtype) - - return {"pixel_values": pixel_values} raise TypeError(f"Invalid image type: {type(image)}") - - -class ImageFeatureData(MultiModalData): - """ - The feature vector of an image, passed directly to the model. - - This should be the output of the vision tower. 
- """ - - def __init__(self, image_features: torch.Tensor) -> None: - self.image_features = image_features - - def __repr__(self) -> str: - image_features = self.image_features - - return (f"{type(self).__name__}(image_features=torch.Tensor(shape=" - f"{image_features.shape}, dtype={image_features.dtype}))") - - -class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]): - - def get_data_type(self) -> Type[ImageFeatureData]: - return ImageFeatureData - - def _default_input_processor( - self, data: ImageFeatureData, model_config: ModelConfig, - vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]: - image_features = data.image_features.to(model_config.dtype) - - return {"image_features": image_features} diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 4789ce5ce4cf..189346360201 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,13 +1,14 @@ import functools from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence, - Tuple, Type, TypeVar) + Tuple, Type, TypeVar, Union) + +from PIL import Image from vllm.config import ModelConfig, VisionLanguageConfig from vllm.logger import init_logger from .base import MultiModalData, MultiModalPlugin -from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData, - ImagePixelPlugin) +from .image import ImageData, ImagePlugin if TYPE_CHECKING: import torch @@ -32,7 +33,7 @@ class MultiModalRegistry: according to its modality and the target model. """ - DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin()) + DEFAULT_PLUGINS = (ImagePlugin(), ) def __init__(self, *, @@ -53,7 +54,17 @@ def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None: self._plugins_by_data_type[data_type] = plugin - def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]): + def _process_external_input(self, data, model_config: ModelConfig, + vlm_config: VisionLanguageConfig): + if isinstance(data, Image.Image): + return self._get_plugin_for_internal_data_type( + ImageData).process_input(ImageData(data), model_config, + vlm_config) + msg = f"Unknown multi-modal data type: {type(data)}" + raise NotImplementedError(msg) + + def _get_plugin_for_internal_data_type(self, + data_type: Type[MultiModalData]): for typ in data_type.mro(): plugin = self._plugins_by_data_type.get(typ) if plugin is not None: @@ -105,41 +116,40 @@ def register_input( See :meth:`MultiModalPlugin.register_input_processor` for more details. """ - return self._get_plugin_for_data_type(data_type) \ + return self._get_plugin_for_internal_data_type(data_type) \ .register_input_processor(processor) - def register_image_pixel_input( + def register_image_input( self, - processor: Optional[ - MultiModalInputProcessor[ImagePixelData]] = None): + processor: Optional[MultiModalInputProcessor[ImageData]] = None): """ Register an input processor for image pixel data to a model class. See :meth:`MultiModalPlugin.register_input_processor` for more details. """ - return self.register_input(ImagePixelData, processor) - - def register_image_feature_input( - self, - processor: Optional[ - MultiModalInputProcessor[ImageFeatureData]] = None): - """ - Register an input processor for image feature data to a model class. + return self.register_input(ImageData, processor) - See :meth:`MultiModalPlugin.register_input_processor` for more details. 
- """ - return self.register_input(ImageFeatureData, processor) - - def process_input(self, data: MultiModalData, model_config: ModelConfig, + def process_input(self, data: Union[MultiModalData, Dict[str, Any]], + model_config: ModelConfig, vlm_config: VisionLanguageConfig): """ - Apply an input processor to a :class:`~MultiModalData` instance passed - to the model. + Apply an input processor before passing in to the model. + + If the data is internally supplied (for profiling), + it's of type :class:`~MultiModalData`. + If externally supplied through user API, it's of type dict. See :meth:`MultiModalPlugin.process_input` for more details. """ - return self._get_plugin_for_data_type(type(data)) \ - .process_input(data, model_config, vlm_config) + if isinstance(data, MultiModalData): + return self._get_plugin_for_internal_data_type(type(data)) \ + .process_input(data, model_config, vlm_config) + else: + result_list = [ + self._process_external_input(d, model_config, vlm_config) + for d in data.values() + ] + return {k: v for d in result_list for k, v in d.items()} def create_input_processor(self, model_config: ModelConfig, vlm_config: VisionLanguageConfig): diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 0cf2c057f892..c80cd7adde58 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -8,7 +8,7 @@ from vllm.config import ModelConfig from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT -from vllm.multimodal.image import ImagePixelData +from vllm.multimodal.image import ImageData class ImageFetchAiohttp: @@ -56,9 +56,9 @@ async def fetch_image(cls, image_url: str) -> Image.Image: return image -async def async_get_and_parse_image(image_url: str) -> ImagePixelData: +async def async_get_and_parse_image(image_url: str) -> ImageData: with await ImageFetchAiohttp.fetch_image(image_url) as image: - return ImagePixelData(image) + return ImageData(image) def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str: diff --git a/vllm/sequence.py b/vllm/sequence.py index 0925d15461fd..cc1ccb7f3cf0 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -3,7 +3,7 @@ import enum from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -257,8 +257,8 @@ def prompt_token_ids(self) -> List[int]: return self.inputs["prompt_token_ids"] @property - def multi_modal_data(self) -> Optional["MultiModalData"]: - return self.inputs.get("multi_modal_data") + def multi_modal_data(self) -> Dict[str, Any]: + return self.inputs.get("multi_modal_data") or {} @property def lora_int_id(self) -> int: @@ -640,7 +640,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, - multi_modal_data: Optional["MultiModalData"] = None, + multi_modal_data: Optional[Dict[str, Any]] = None, encoder_seq_data: Optional[SequenceData] = None, cross_block_table: Optional[List[int]] = None, ) -> None: diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py index 3239b1d0cfa2..265a8ec99efe 100644 --- a/vllm/transformers_utils/image_processor.py +++ b/vllm/transformers_utils/image_processor.py @@ -1,5 +1,4 @@ from functools import lru_cache -from typing import Optional from transformers import AutoImageProcessor from transformers.image_processing_utils import BaseImageProcessor @@ 
-13,7 +12,6 @@ def get_image_processor( processor_name: str, *args, trust_remote_code: bool = False, - revision: Optional[str] = None, **kwargs, ) -> BaseImageProcessor: """Gets an image processor for the given model name via HuggingFace.""" @@ -22,7 +20,6 @@ def get_image_processor( processor_name, *args, trust_remote_code=trust_remote_code, - revision=revision, **kwargs) except ValueError as e: # If the error pertains to the processor class not existing or not diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py index e3464c0d3900..f01c7d74ebe3 100644 --- a/vllm/worker/cpu_model_runner.py +++ b/vllm/worker/cpu_model_runner.py @@ -167,7 +167,7 @@ def _prepare_prompt( input_positions.extend(list(range(computed_len, seq_len))) mm_data = seq_group_metadata.multi_modal_data - if mm_data is not None: + if mm_data: # Process multi-modal data if self.multi_modal_input_processor is None: raise ValueError( diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py index 9fdb2ea5dd4e..3f0b455aabab 100644 --- a/vllm/worker/model_runner.py +++ b/vllm/worker/model_runner.py @@ -509,7 +509,7 @@ def _prepare_model_input_tensors( is not None else 1)) mm_data = seq_group_metadata.multi_modal_data - if mm_data is not None: + if mm_data: # Process multi-modal data if self.multi_modal_input_processor is None: raise ValueError( From 174ca90ad10362751e9ccb2138a31d83fdb6c6e4 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Wed, 26 Jun 2024 12:51:27 -0700 Subject: [PATCH 02/21] address comments Signed-off-by: Xiaowei Jiang --- examples/llava_example.py | 1 - tests/conftest.py | 1 - vllm/inputs.py | 9 ++++--- vllm/multimodal/__init__.py | 9 ++++--- vllm/multimodal/base.py | 30 +++++++++++++++------ vllm/multimodal/image.py | 5 +++- vllm/multimodal/registry.py | 52 +++++++++++++++++++++++++------------ vllm/sequence.py | 8 +++--- 8 files changed, 76 insertions(+), 39 deletions(-) diff --git a/examples/llava_example.py b/examples/llava_example.py index c4ddab299fa5..7f3d84f99f76 100644 --- a/examples/llava_example.py +++ b/examples/llava_example.py @@ -1,4 +1,3 @@ -import argparse import os import subprocess diff --git a/tests/conftest.py b/tests/conftest.py index c3a4c63fd7ce..eff34e9d2937 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -23,7 +23,6 @@ from vllm.inputs import TextPrompt from vllm.logger import init_logger from vllm.multimodal import MultiModalData -from vllm.multimodal.image import ImageData from vllm.sequence import SampleLogprobs from vllm.utils import cuda_device_count_stateless, is_cpu diff --git a/vllm/inputs.py b/vllm/inputs.py index 518a342ada2c..71487e89a97c 100644 --- a/vllm/inputs.py +++ b/vllm/inputs.py @@ -1,10 +1,10 @@ -from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, - Sequence, TypedDict, Union, cast, overload) +from typing import (TYPE_CHECKING, Dict, List, Literal, Optional, Sequence, + TypedDict, Union, cast, overload) from typing_extensions import NotRequired if TYPE_CHECKING: - from vllm.multimodal import MultiModalData + from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData class ParsedText(TypedDict): @@ -125,6 +125,7 @@ class TextTokensPrompt(TypedDict): class LLMInputs(TypedDict): + """A structured class to construct :class:`Sequence` with. 
""" prompt_token_ids: List[int] prompt: NotRequired[Optional[str]] - multi_modal_data: NotRequired[Optional[Dict[str, Any]]] + multi_modal_data: NotRequired[Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]]] diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 270012e7d1c3..e2384ef5f8ad 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,7 +1,10 @@ -from .base import MultiModalData, MultiModalPlugin +from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry __all__ = [ - "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY", - "MultiModalRegistry" + "MultiModalData", + "MultiModalPlugin", + "MULTIMODAL_REGISTRY", + "MultiModalRegistry", + "EXTERNAL_MM_DATA_TYPE", ] diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 847752449ba8..da68baf7f3d4 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,12 +1,13 @@ from abc import ABC, abstractmethod -from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type, - TypeVar) +from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Tuple, + Type, TypeVar, Union) from vllm.config import ModelConfig, VisionLanguageConfig from vllm.logger import init_logger if TYPE_CHECKING: import torch + from PIL import Image from torch import nn logger = init_logger(__name__) @@ -22,7 +23,7 @@ class MultiModalData: :class:`~MultiModalPlugin`. Finally, register the new plugin to - :const:`vllm.multimodal.MULTIMODAL_REGISTRY`. + :const:`vllm.multimodal.MULTIMODAL_REGISTRY` (beyond the default plugins). This enables models to call :meth:`MultiModalRegistry.register_input` for the new modality. """ @@ -32,6 +33,8 @@ class MultiModalData: D = TypeVar("D", bound=MultiModalData) N = TypeVar("N", bound=Type["nn.Module"]) +EXTERNAL_MM_DATA_TYPE = Union["Image.Image", "torch.Tensor"] + MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig], Dict[str, "torch.Tensor"]] """Return a dictionary to be passed as keyword arguments to @@ -62,13 +65,23 @@ def __init__(self) -> None: MultiModalInputProcessor[D]] = {} @abstractmethod - def get_data_type(self) -> Type[D]: + def get_internal_data_type(self) -> Type[D]: """ Get the modality (subclass of :class:`~MultiModalData`) served by this plugin. """ raise NotImplementedError + @abstractmethod + def get_external_data_type(self) -> Tuple[str, EXTERNAL_MM_DATA_TYPE]: + """The data type that this plugin handles. + + For `LLM.generate(multi_modal_data={"key": value})` will + be handled by plugin with an external data type of + (key, type(value)). + """ + raise NotImplementedError + @abstractmethod def _default_input_processor( self, data: D, model_config: ModelConfig, @@ -85,10 +98,11 @@ def register_input_processor(self, """ Register an input processor to a model class. - When the model receives input data that matches the modality served by - this plugin (see :meth:`get_data_type`), the provided input processor is - applied to preprocess the data. If `None` is provided, then the default - input processor is applied instead. + When LLM receives input data that matches the modality served by + this plugin (see :meth:`get_internal_data_type`), the provided input + processor is applied to preprocess the data. + If `None` is provided, then the default input processor is applied + instead. 
""" def wrapper(model_cls: N) -> N: diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index fe8b3d5ef9a9..e32e761bb95f 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -61,9 +61,12 @@ def __repr__(self) -> str: class ImagePlugin(MultiModalPlugin[ImageData]): - def get_data_type(self) -> Type[ImageData]: + def get_internal_data_type(self) -> Type[ImageData]: return ImageData + def get_external_data_type(self) -> Tuple[str, Type[Image.Image]]: + return ("image", Image.Image) + def _get_hf_image_processor(self, model_config: ModelConfig): return cached_get_image_processor( model_config.model, diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 189346360201..e479384dff2c 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -2,12 +2,10 @@ from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence, Tuple, Type, TypeVar, Union) -from PIL import Image - from vllm.config import ModelConfig, VisionLanguageConfig from vllm.logger import init_logger -from .base import MultiModalData, MultiModalPlugin +from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin from .image import ImageData, ImagePlugin if TYPE_CHECKING: @@ -31,6 +29,8 @@ class MultiModalRegistry: """ This registry is used by model runners to dispatch data processing according to its modality and the target model. + + The registry handles both external and internal data input. """ DEFAULT_PLUGINS = (ImagePlugin(), ) @@ -39,34 +39,51 @@ def __init__(self, *, plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS ) -> None: - self._plugins_by_data_type = {p.get_data_type(): p for p in plugins} + self._plugins_by_internal_data_type = { + p.get_internal_data_type(): p + for p in plugins + } + self._plugins_by_external_data_type = { + p.get_external_data_type(): p + for p in plugins + } self._dummy_factories_by_model_type: Dict[Type["nn.Module"], MultiModalDummyFactory] = {} def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None: - data_type = plugin.get_data_type() + data_type = plugin.get_internal_data_type() - if data_type in self._plugins_by_data_type: + if data_type in self._plugins_by_internal_data_type: logger.warning( "A plugin is already registered for data type %s, " "and will be overwritten by the new plugin %s.", data_type, plugin) - self._plugins_by_data_type[data_type] = plugin + self._plugins_by_internal_data_type[data_type] = plugin - def _process_external_input(self, data, model_config: ModelConfig, + def _process_external_input(self, key, value, model_config: ModelConfig, vlm_config: VisionLanguageConfig): - if isinstance(data, Image.Image): - return self._get_plugin_for_internal_data_type( - ImageData).process_input(ImageData(data), model_config, - vlm_config) - msg = f"Unknown multi-modal data type: {type(data)}" + plugin = self._get_plugin_for_external_data_type(key, type(value)) + if plugin: + return plugin.process_input(plugin.get_internal_data_type()(value), + model_config, vlm_config) + msg = f"Unknown multi-modal data type: {type(value)}" + raise NotImplementedError(msg) + + def _get_plugin_for_external_data_type(self, key: str, + data_type: Type[Any]): + for typ in data_type.mro(): + plugin = self._plugins_by_external_data_type.get((key, typ)) + if plugin is not None: + return plugin + + msg = f"Unknown multi-modal data type: {data_type}" raise NotImplementedError(msg) def _get_plugin_for_internal_data_type(self, data_type: Type[MultiModalData]): for typ in data_type.mro(): - plugin = 
self._plugins_by_data_type.get(typ) + plugin = self._plugins_by_internal_data_type.get(typ) if plugin is not None: return plugin @@ -129,7 +146,8 @@ def register_image_input( """ return self.register_input(ImageData, processor) - def process_input(self, data: Union[MultiModalData, Dict[str, Any]], + def process_input(self, data: Union[MultiModalData, + Dict[str, EXTERNAL_MM_DATA_TYPE]], model_config: ModelConfig, vlm_config: VisionLanguageConfig): """ @@ -146,8 +164,8 @@ def process_input(self, data: Union[MultiModalData, Dict[str, Any]], .process_input(data, model_config, vlm_config) else: result_list = [ - self._process_external_input(d, model_config, vlm_config) - for d in data.values() + self._process_external_input(k, v, model_config, vlm_config) + for k, v in data.items() ] return {k: v for d in result_list for k, v in d.items()} diff --git a/vllm/sequence.py b/vllm/sequence.py index cc1ccb7f3cf0..ebc56f65c465 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -3,7 +3,7 @@ import enum from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union import torch @@ -14,7 +14,7 @@ from vllm.sampling_params import SamplingParams if TYPE_CHECKING: - from vllm.multimodal import MultiModalData + from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -257,7 +257,7 @@ def prompt_token_ids(self) -> List[int]: return self.inputs["prompt_token_ids"] @property - def multi_modal_data(self) -> Dict[str, Any]: + def multi_modal_data(self) -> Dict[str, "EXTERNAL_MM_DATA_TYPE"]: return self.inputs.get("multi_modal_data") or {} @property @@ -640,7 +640,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, - multi_modal_data: Optional[Dict[str, Any]] = None, + multi_modal_data: Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]] = None, encoder_seq_data: Optional[SequenceData] = None, cross_block_table: Optional[List[int]] = None, ) -> None: From 5b3e9aae8a2eccd69d5e6d6d8d602721489be470 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Wed, 26 Jun 2024 13:23:58 -0700 Subject: [PATCH 03/21] remove image_input_type altogether. 
Signed-off-by: Xiaowei Jiang --- tests/models/test_llava.py | 3 +-- tests/models/test_llava_next.py | 1 - tests/models/test_phi3v.py | 3 +-- tests/multimodal/test_processor.py | 7 ++----- vllm/config.py | 4 ---- vllm/engine/arg_utils.py | 1 - 6 files changed, 4 insertions(+), 15 deletions(-) diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index e1dd57718690..17cec7fc61ff 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -26,8 +26,7 @@ def iter_llava_configs(model_name: str): for (h, w), f in image_hw_to_feature_size.items(): input_shape = (1, 3, h, w) yield (model_name, - VisionLanguageConfig(image_input_type=None, - image_feature_size=f, + VisionLanguageConfig(image_feature_size=f, image_token_id=32000, image_input_shape=input_shape)) diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index efab0a241044..0cf2942e94c7 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -35,7 +35,6 @@ def iter_llava_next_configs(model_name: str): input_shape = (1, 3, h, w) yield (model_name, VisionLanguageConfig( - image_input_type=None, image_feature_size=f, image_token_id=32000, image_input_shape=input_shape, diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 39275ee843e5..e9cc5e826296 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -28,8 +28,7 @@ def iter_phi3v_configs(model_name: str): for (h, w), f in image_hw_to_feature_size.items(): input_shape = (1, 3, h, w) yield (model_name, - VisionLanguageConfig(image_input_type=None, - image_feature_size=f, + VisionLanguageConfig(image_feature_size=f, image_token_id=32044, image_input_shape=input_shape)) diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py index 52231c1b7b70..12f37e431cae 100644 --- a/tests/multimodal/test_processor.py +++ b/tests/multimodal/test_processor.py @@ -27,7 +27,6 @@ def test_clip_image_processor(image_assets, dtype): revision=None, ) vlm_config = VisionLanguageConfig( - image_input_type=None, image_token_id=32000, image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), image_feature_size=576, @@ -73,8 +72,7 @@ def test_llava_next_image_processor(image_assets, dtype): dtype=dtype, revision=None, ) - vlm_config = VisionLanguageConfig(image_input_type=None, - image_token_id=64000, + vlm_config = VisionLanguageConfig(image_token_id=64000, image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), image_feature_size=2928) @@ -115,8 +113,7 @@ def test_image_pixel_types(image_assets, dtype): dtype=dtype, revision=None, ) - vlm_config = VisionLanguageConfig(image_input_type=None, - image_token_id=32000, + vlm_config = VisionLanguageConfig(image_token_id=32000, image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), image_feature_size=576) diff --git a/vllm/config.py b/vllm/config.py index 368a41c3f329..cfaef307a19e 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1159,8 +1159,6 @@ def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig): class VisionLanguageConfig: """Configs the input data format and how models should run for vision language models.""" - - image_input_type: None # The input id corresponding to image token. image_token_id: int # Used for running `run_prefill_max_token`. 
@@ -1186,8 +1184,6 @@ def as_cli_args_dict(self) -> Dict[str, Any]: """ result: Dict[str, Any] = {} for f in fields(self): - if f.name == "image_input_type": - continue value = getattr(self, f.name) if isinstance(value, enum.Enum): result[f.name] = value.name.lower() diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index afbf0b33d4c8..03d527d5ce90 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -718,7 +718,6 @@ def create_engine_config(self, ) -> EngineConfig: '`image_feature_size` together with `image_token_id`.') vision_language_config = VisionLanguageConfig( - image_input_type=None, image_token_id=self.image_token_id, image_input_shape=str_to_int_tuple(self.image_input_shape), image_feature_size=self.image_feature_size, From b7acf3a0b943bcc6618a924d14388878fa57ad5a Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Wed, 26 Jun 2024 14:28:12 -0700 Subject: [PATCH 04/21] types Signed-off-by: Xiaowei Jiang --- docs/source/models/vlm.rst | 2 +- vllm/multimodal/base.py | 2 +- vllm/multimodal/registry.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 169265078c7f..d4613be64aa5 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. -* ``multi_modal_data``: This is a loosely structured dict that contains multi modal data. +* ``multi_modal_data``: This is a dictionary that contains multi-modal data. .. code-block:: python diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index da68baf7f3d4..4ddd8b53dd83 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -73,7 +73,7 @@ def get_internal_data_type(self) -> Type[D]: raise NotImplementedError @abstractmethod - def get_external_data_type(self) -> Tuple[str, EXTERNAL_MM_DATA_TYPE]: + def get_external_data_type(self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]: """The data type that this plugin handles. For `LLM.generate(multi_modal_data={"key": value})` will diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index e479384dff2c..ed20c6fd59c1 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -77,7 +77,7 @@ def _get_plugin_for_external_data_type(self, key: str, if plugin is not None: return plugin - msg = f"Unknown multi-modal data type: {data_type}" + msg = f"No plugin found for key {key} and type {data_type}" raise NotImplementedError(msg) def _get_plugin_for_internal_data_type(self, From f22b2198cf99b6d1cd2c5067d0fc1d834d746ec4 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Wed, 26 Jun 2024 14:48:30 -0700 Subject: [PATCH 05/21] format Signed-off-by: Xiaowei Jiang --- vllm/multimodal/base.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 4ddd8b53dd83..233068c4a545 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -73,7 +73,8 @@ def get_internal_data_type(self) -> Type[D]: raise NotImplementedError @abstractmethod - def get_external_data_type(self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]: + def get_external_data_type( + self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]: """The data type that this plugin handles. 
For `LLM.generate(multi_modal_data={"key": value})` will From f84b793faf61086244ac1c19b67e4932568abb01 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Fri, 28 Jun 2024 08:22:57 -0700 Subject: [PATCH 06/21] format Signed-off-by: Xiaowei Jiang --- tests/multimodal/test_mapper.py | 18 +----------------- vllm/model_executor/models/llava.py | 4 +--- vllm/model_executor/models/llava_next.py | 4 ++-- vllm/multimodal/__init__.py | 2 +- vllm/multimodal/base.py | 1 - vllm/multimodal/registry.py | 17 ++++++++++------- vllm/transformers_utils/image_processor.py | 2 -- 7 files changed, 15 insertions(+), 33 deletions(-) diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 3a5049f1ba7c..2238b0e9f9f7 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -2,7 +2,7 @@ import pytest from transformers import CLIPImageProcessor, LlavaNextImageProcessor -from vllm.config import ModelConfig, VisionLanguageConfig +from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.multimodal.image import ImageData @@ -12,7 +12,6 @@ @pytest.mark.parametrize("dtype", ["half", "float"]) def test_clip_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" - IMAGE_HEIGHT = IMAGE_WIDTH = 560 hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME) assert isinstance(hf_processor, CLIPImageProcessor) @@ -26,11 +25,6 @@ def test_clip_image_processor(image_assets, dtype): dtype=dtype, revision=None, ) - multimodal_config=VisionLanguageConfig( - image_token_id=32000, - image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH), - image_feature_size=576, - ) for asset in image_assets: hf_result = hf_processor.preprocess( @@ -57,7 +51,6 @@ def test_clip_image_processor(image_assets, dtype): @pytest.mark.parametrize("dtype", ["half", "float"]) def test_llava_next_image_processor(image_assets, dtype): MODEL_NAME = "llava-hf/llava-v1.6-34b-hf" - IMAGE_HEIGHT = IMAGE_WIDTH = 560 hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME) assert isinstance(hf_processor, LlavaNextImageProcessor) @@ -71,10 +64,6 @@ def test_llava_next_image_processor(image_assets, dtype): dtype=dtype, revision=None, ) - multimodal_config = VisionLanguageConfig(image_token_id=64000, - image_input_shape=(1, 3, IMAGE_HEIGHT, - IMAGE_WIDTH), - image_feature_size=2928) for asset in image_assets: hf_result = hf_processor.preprocess( @@ -100,7 +89,6 @@ def test_llava_next_image_processor(image_assets, dtype): @pytest.mark.parametrize("dtype", ["float"]) def test_image_pixel_types(image_assets, dtype): MODEL_NAME = "llava-hf/llava-1.5-7b-hf" - IMAGE_HEIGHT = IMAGE_WIDTH = 560 model_config = ModelConfig( model=MODEL_NAME, @@ -111,10 +99,6 @@ def test_image_pixel_types(image_assets, dtype): dtype=dtype, revision=None, ) - multimodal_config = VisionLanguageConfig(image_token_id=32000, - image_input_shape=(1, 3, IMAGE_HEIGHT, - IMAGE_WIDTH), - image_feature_size=576) for asset in image_assets: image_result = MULTIMODAL_REGISTRY.map_input( model_config, diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 87dab86f4941..72d39bb124ef 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -79,7 +79,6 @@ class LlavaImagePixelInputs(TypedDict): def dummy_data_for_llava(ctx: InputContext, seq_len: int): - multimodal_config = ctx.get_multimodal_config() hf_config = ctx.get_hf_config(LlavaConfig) vision_config = hf_config.vision_config @@ -153,14 +152,13 @@ def 
_parse_and_validate_image_input( if not isinstance(pixel_values, torch.Tensor): raise ValueError("Incorrect type of pixel values. " - f"Got type: {type(pixel_values)}") + f"Got type: {type(pixel_values)}") return LlavaImagePixelInputs( type="pixel_values", data=self._validate_image_data(pixel_values), ) - def _select_image_features(self, image_features: torch.Tensor, *, strategy: str) -> torch.Tensor: # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421 # noqa diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index a65376eedcbe..dfdf512b968a 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -24,8 +24,8 @@ from vllm.multimodal.image import ImageData from vllm.sequence import SamplerOutput -from .clip import (dummy_pixel_data_for_clip, - dummy_seq_data_for_clip, get_clip_patch_grid_length) +from .clip import (dummy_pixel_data_for_clip, dummy_seq_data_for_clip, + get_clip_patch_grid_length) from .interfaces import SupportsVision from .llava import LlavaMultiModalProjector, merge_vision_embeddings diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index 1d76d2d82b48..a4ccfea0783e 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,4 +1,4 @@ -from .base import MultiModalData, MultiModalPlugin, EXTERNAL_MM_DATA_TYPE +from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 4fc2f480378b..6e1bf2fea385 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -36,7 +36,6 @@ class MultiModalData: EXTERNAL_MM_DATA_TYPE = Union["Image.Image", "torch.Tensor"] MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]] - """Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. 
This is similar in concept to tokenizers and processors in HuggingFace Transformers.""" diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 4900823b2ef7..d1fa5c1f58bf 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,7 +6,8 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin, MultiModalInputMapper +from .base import (EXTERNAL_MM_DATA_TYPE, MultiModalData, + MultiModalInputMapper, MultiModalPlugin) from .image import ImageData, ImagePlugin logger = init_logger(__name__) @@ -63,7 +64,8 @@ def register_image_input_mapper( def _process_external_input(self, key, value, model_config: ModelConfig): plugin = self._get_plugin_for_external_data_type(key, type(value)) if plugin: - return plugin.map_input(model_config, plugin.get_internal_data_type()(value)) + return plugin.map_input(model_config, + plugin.get_internal_data_type()(value)) msg = f"Unknown multi-modal data type: {type(value)}" raise NotImplementedError(msg) @@ -100,9 +102,9 @@ def register_input_mapper( return self._get_plugin_for_internal_data_type(data_type) \ .register_input_mapper(mapper) - def register_image_input( - self, - mapper: Optional[MultiModalInputMapper[ImageData]] = None): + def register_image_input(self, + mapper: Optional[ + MultiModalInputMapper[ImageData]] = None): """ Register an input mapper for image pixel data to a model class. @@ -110,8 +112,9 @@ def register_image_input( """ return self.register_input_mapper(ImageData, mapper) - def map_input(self, model_config: ModelConfig, data: Union[MultiModalData, - Dict[str, EXTERNAL_MM_DATA_TYPE]]): + def map_input(self, model_config: ModelConfig, + data: Union[MultiModalData, Dict[str, + EXTERNAL_MM_DATA_TYPE]]): """ Apply an input mapper to a :class:`~MultiModalData` instance passed to the model. diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py index af1d772fc466..354dcb526395 100644 --- a/vllm/transformers_utils/image_processor.py +++ b/vllm/transformers_utils/image_processor.py @@ -1,5 +1,3 @@ -from typing import Optional - from transformers import AutoImageProcessor from transformers.image_processing_utils import BaseImageProcessor From a934663fc33561b251df1c7cbd361fe734e97014 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Fri, 28 Jun 2024 09:57:19 -0700 Subject: [PATCH 07/21] ExternalMultiModalDataDict Signed-off-by: Xiaowei Jiang --- vllm/inputs/data.py | 6 +++--- vllm/multimodal/__init__.py | 4 ++-- vllm/multimodal/base.py | 18 +++++++++++++----- vllm/multimodal/registry.py | 7 +++---- vllm/sequence.py | 6 +++--- 5 files changed, 24 insertions(+), 17 deletions(-) diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 932b8b7b38c5..2f761057bea5 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -1,10 +1,10 @@ -from typing import (TYPE_CHECKING, Dict, List, Literal, Optional, Sequence, +from typing import (TYPE_CHECKING, List, Literal, Optional, Sequence, TypedDict, Union, cast, overload) from typing_extensions import NotRequired if TYPE_CHECKING: - from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData + from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData class ParsedText(TypedDict): @@ -136,7 +136,7 @@ class LLMInputs(TypedDict): The original prompt text corresponding to the token IDs, if available. 
""" - multi_modal_data: NotRequired[Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]]] + multi_modal_data: NotRequired[Optional["ExternalMultiModalDataDict"]] """ Optional multi-modal data to pass to the model, if the model supports it. diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index a4ccfea0783e..a9bd58a29549 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,4 +1,4 @@ -from .base import EXTERNAL_MM_DATA_TYPE, MultiModalData, MultiModalPlugin +from .base import ExternalMultiModalDataDict, MultiModalData, MultiModalPlugin from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -15,5 +15,5 @@ "MultiModalPlugin", "MULTIMODAL_REGISTRY", "MultiModalRegistry", - "EXTERNAL_MM_DATA_TYPE", + "ExternalMultiModalDataDict", ] diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 6e1bf2fea385..5a2e28cd7a39 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Tuple, - Type, TypeVar, Union) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, Optional, + Tuple, Type, TypedDict, TypeVar, Union) from vllm.config import ModelConfig from vllm.inputs import InputContext @@ -18,6 +18,8 @@ class MultiModalData: """ Base class that contains multi-modal data. + This is for internal use. + To add a new modality, add a new file under ``multimodal`` directory. In this new file, subclass :class:`~MultiModalData` and @@ -34,7 +36,14 @@ class MultiModalData: D = TypeVar("D", bound=MultiModalData) N = TypeVar("N", bound=Type["nn.Module"]) -EXTERNAL_MM_DATA_TYPE = Union["Image.Image", "torch.Tensor"] + +class ExternalMultiModalDataBuiltins(TypedDict, total=False): + image: Union["Image.Image", "torch.Tensor"] + + +ExternalMultiModalDataDict = Union[ExternalMultiModalDataBuiltins, Dict[str, + Any]] + MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]] """Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers @@ -65,8 +74,7 @@ def get_internal_data_type(self) -> Type[D]: raise NotImplementedError @abstractmethod - def get_external_data_type( - self) -> Tuple[str, Type[EXTERNAL_MM_DATA_TYPE]]: + def get_external_data_type(self) -> Tuple[str, Type[Any]]: """The data type that this plugin handles. For `LLM.generate(multi_modal_data={"key": value})` will diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index d1fa5c1f58bf..a0bd960705e3 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,12 +1,12 @@ import functools -from typing import Any, Dict, Optional, Sequence, Type, TypeVar, Union +from typing import Any, Optional, Sequence, Type, TypeVar, Union from torch import nn from vllm.config import ModelConfig from vllm.logger import init_logger -from .base import (EXTERNAL_MM_DATA_TYPE, MultiModalData, +from .base import (ExternalMultiModalDataDict, MultiModalData, MultiModalInputMapper, MultiModalPlugin) from .image import ImageData, ImagePlugin @@ -113,8 +113,7 @@ def register_image_input(self, return self.register_input_mapper(ImageData, mapper) def map_input(self, model_config: ModelConfig, - data: Union[MultiModalData, Dict[str, - EXTERNAL_MM_DATA_TYPE]]): + data: Union[MultiModalData, ExternalMultiModalDataDict]): """ Apply an input mapper to a :class:`~MultiModalData` instance passed to the model. 
diff --git a/vllm/sequence.py b/vllm/sequence.py index 5cf4a71ea403..549810e8f0a3 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from vllm.inputs import LLMInputs - from vllm.multimodal import EXTERNAL_MM_DATA_TYPE, MultiModalData + from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -258,7 +258,7 @@ def prompt_token_ids(self) -> List[int]: return self.inputs["prompt_token_ids"] @property - def multi_modal_data(self) -> Dict[str, "EXTERNAL_MM_DATA_TYPE"]: + def multi_modal_data(self) -> "ExternalMultiModalDataDict": return self.inputs.get("multi_modal_data") or {} @property @@ -617,7 +617,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, - multi_modal_data: Optional[Dict[str, "EXTERNAL_MM_DATA_TYPE"]] = None, + multi_modal_data: Optional["ExternalMultiModalDataDict"] = None, encoder_seq_data: Optional[SequenceData] = None, cross_block_table: Optional[List[int]] = None, ) -> None: From 2144d3a013b23f15331a22c55b90f203f7d13545 Mon Sep 17 00:00:00 2001 From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Date: Fri, 28 Jun 2024 12:57:40 -0700 Subject: [PATCH 08/21] mention schema --- docs/source/models/vlm.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index d4613be64aa5..639e7a9a9284 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. -* ``multi_modal_data``: This is a dictionary that contains multi-modal data. +* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`ExternalMultiModalDataDict`. .. code-block:: python From d432934f824b1d71a822b03538e8c6f2f020f1f3 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Sun, 30 Jun 2024 18:58:31 -0700 Subject: [PATCH 09/21] address comments Signed-off-by: Xiaowei Jiang --- docs/source/models/vlm.rst | 2 +- tests/conftest.py | 8 --- vllm/entrypoints/openai/serving_chat.py | 69 +++++++++++------------- vllm/model_executor/models/llava.py | 8 --- vllm/model_executor/models/llava_next.py | 3 ++ vllm/multimodal/utils.py | 7 +-- 6 files changed, 37 insertions(+), 60 deletions(-) diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 639e7a9a9284..d4bb86ecf576 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`ExternalMultiModalDataDict`. +* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.ExternalMultiModalDataDict`. .. 
code-block:: python diff --git a/tests/conftest.py b/tests/conftest.py index eff34e9d2937..54c250692df5 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -46,14 +46,6 @@ def _read_prompts(filename: str) -> List[str]: class ImageAsset: name: Literal["stop_sign", "cherry_blossom"] - @cached_property - def pixel_values(self) -> torch.Tensor: - return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt") - - @cached_property - def image_features(self) -> torch.Tensor: - return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt") - @cached_property def pil_image(self) -> Image.Image: return Image.open(_IMAGE_DIR / f"{self.name}.jpg") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index bd0d82545ca5..a5ee08c57a09 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -26,9 +26,8 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) -from vllm.multimodal.image import ImageData -from vllm.multimodal.utils import (async_get_and_parse_image, - get_full_image_text_prompt) +from vllm.multimodal import ExternalMultiModalDataDict +from vllm.multimodal.utils import get_full_image_text_prompt, ImageFetchAiohttp from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -47,7 +46,7 @@ class ConversationMessage(TypedDict): @dataclass(frozen=True) class ChatMessageParseResult: messages: List[ConversationMessage] - image_futures: List[Awaitable[ImageData]] = field(default_factory=list) + mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field(default_factory=list) class OpenAIServingChat(OpenAIServing): @@ -102,7 +101,7 @@ def _parse_chat_message_content_parts( parts: Iterable[ChatCompletionContentPartParam], ) -> ChatMessageParseResult: texts: List[str] = [] - image_futures: List[Awaitable[ImageData]] = [] + mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = [] vlm_config: Optional[VisionLanguageConfig] = getattr( self.engine.engine, "vision_language_config", None) @@ -112,39 +111,36 @@ def _parse_chat_message_content_parts( part_type = part["type"] if part_type == "text": text = cast(ChatCompletionContentPartTextParam, part)["text"] - texts.append(text) elif part_type == "image_url": if vlm_config is None: raise ValueError( "'image_url' input is not supported as the loaded " "model is not multimodal.") + assert self.tokenizer is not None + image_url = cast(ChatCompletionContentPartImageParam, + part)["image_url"] - elif len(image_futures) == 0: - assert self.tokenizer is not None - image_url = cast(ChatCompletionContentPartImageParam, - part)["image_url"] - - if image_url.get("detail", "auto") != "auto": - logger.warning( - "'image_url.detail' is currently not supported and " - "will be ignored.") + if image_url.get("detail", "auto") != "auto": + logger.warning( + "'image_url.detail' is currently not supported and " + "will be ignored.") - image_future = async_get_and_parse_image(image_url["url"]) - image_futures.append(image_future) + async def async_get_and_parse_image(image_url: str): + with await ImageFetchAiohttp.fetch_image(image_url) as image: + return {"image": image} - else: - raise NotImplementedError( - "Multiple 'image_url' input is currently not supported." 
- ) + mm_future = async_get_and_parse_image(image_url["url"]) + mm_futures.append(mm_future) else: raise NotImplementedError(f"Unknown part type: {part_type}") text_prompt = "\n".join(texts) - if vlm_config is not None and len(image_futures): - + if vlm_config is not None and len(mm_futures): + + assert len(mm_futures) == 1, "Multiple images is not supported." (image_token_prompt, image_token_str) = vlm_config.get_image_token_text(self.tokenizer) @@ -171,7 +167,7 @@ def _parse_chat_message_content_parts( messages = [ConversationMessage(role=role, content=text_prompt)] return ChatMessageParseResult(messages=messages, - image_futures=image_futures) + mm_futures=mm_futures) def _parse_chat_message_content( self, @@ -181,10 +177,10 @@ def _parse_chat_message_content( content = message.get("content") if content is None: - return ChatMessageParseResult(messages=[], image_futures=[]) + return ChatMessageParseResult(messages=[], mm_futures=[]) if isinstance(content, str): messages = [ConversationMessage(role=role, content=content)] - return ChatMessageParseResult(messages=messages, image_futures=[]) + return ChatMessageParseResult(messages=messages, mm_futures=[]) return self._parse_chat_message_content_parts(role, content) @@ -209,13 +205,13 @@ async def create_chat_completion( try: conversation: List[ConversationMessage] = [] - image_futures: List[Awaitable[ImageData]] = [] + mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = [] for msg in request.messages: chat_parsed_result = self._parse_chat_message_content(msg) conversation.extend(chat_parsed_result.messages) - image_futures.extend(chat_parsed_result.image_futures) + mm_futures.extend(chat_parsed_result.mm_futures) prompt = self.tokenizer.apply_chat_template( conversation=conversation, @@ -226,15 +222,14 @@ async def create_chat_completion( logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) - # Fetch image data - image_data: Optional[ImageData] = None + mm_data: Optional[ExternalMultiModalDataDict] = None try: - if len(image_futures): - # since we support only single image currently - assert len(image_futures) == 1 - image_data = await image_futures[0] + if len(mm_futures): + # since we support only single mm data currently + assert len(mm_futures) == 1 + mm_data = await mm_futures[0] except Exception as e: - logger.error("Error in loading image data: %s", e) + logger.error("Error in loading multi-modal data: %s", e) return self.create_error_response(str(e)) request_id = f"cmpl-{random_uuid()}" @@ -265,8 +260,8 @@ async def create_chat_completion( "prompt": prompt_text, "prompt_token_ids": prompt_ids, } - if image_data is not None: - inputs["multi_modal_data"] = image_data + if mm_data is not None: + inputs["multi_modal_data"] = mm_data is_tracing_enabled = await self.engine.is_tracing_enabled() trace_headers = None diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 72d39bb124ef..78823a513488 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -228,14 +228,6 @@ def forward( input_ids: Flattened (concatenated) input_ids corresponding to a batch. pixel_values: The pixels in each input image. - Expects a batch with shape `[1, 3, 336, 336]`. 
- (Only applicable to `PIXEL_VALUES` mode) - - See also: - Each input maps to huggingface implementation, as follows: - - - `pixel_values`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L360 - - `image_features`: https://github.com/huggingface/transformers/blob/v4.41.1/src/transformers/models/llava/modeling_llava.py#L437 """ image_input = self._parse_and_validate_image_input(**kwargs) diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index dfdf512b968a..e18198720ed9 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -232,6 +232,9 @@ def _parse_and_validate_image_input( pixel_values = kwargs.pop("pixel_values", None) image_sizes = kwargs.pop("image_sizes", None) + if pixel_values is None or image_sizes is None: + return None + if not isinstance(pixel_values, torch.Tensor): raise ValueError("Incorrect type of pixel values. " f"Got type: {type(pixel_values)}") diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index c80cd7adde58..aad9822db78d 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -8,7 +8,6 @@ from vllm.config import ModelConfig from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT -from vllm.multimodal.image import ImageData class ImageFetchAiohttp: @@ -53,14 +52,10 @@ async def fetch_image(cls, image_url: str) -> Image.Image: "Invalid 'image_url': A valid 'image_url' must start " "with either 'data:image' or 'http'.") + image.load() return image -async def async_get_and_parse_image(image_url: str) -> ImageData: - with await ImageFetchAiohttp.fetch_image(image_url) as image: - return ImageData(image) - - def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str: """Encode a pillow image to base64 format.""" From ab347bc649c2867114d6e7c0d044ea795bc895a6 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Sun, 30 Jun 2024 19:11:36 -0700 Subject: [PATCH 10/21] format Signed-off-by: Xiaowei Jiang --- vllm/entrypoints/openai/serving_chat.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index a5ee08c57a09..8c844f83077d 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -27,7 +27,7 @@ from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.multimodal import ExternalMultiModalDataDict -from vllm.multimodal.utils import get_full_image_text_prompt, ImageFetchAiohttp +from vllm.multimodal.utils import ImageFetchAiohttp, get_full_image_text_prompt from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -46,7 +46,8 @@ class ConversationMessage(TypedDict): @dataclass(frozen=True) class ChatMessageParseResult: messages: List[ConversationMessage] - mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field(default_factory=list) + mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field( + default_factory=list) class OpenAIServingChat(OpenAIServing): @@ -119,7 +120,7 @@ def _parse_chat_message_content_parts( "model is not multimodal.") assert self.tokenizer is not None image_url = cast(ChatCompletionContentPartImageParam, - part)["image_url"] + part)["image_url"] if image_url.get("detail", "auto") != "auto": logger.warning( @@ -127,7 +128,8 @@ def _parse_chat_message_content_parts( "will 
be ignored.") async def async_get_and_parse_image(image_url: str): - with await ImageFetchAiohttp.fetch_image(image_url) as image: + with await ImageFetchAiohttp.fetch_image(image_url + ) as image: return {"image": image} mm_future = async_get_and_parse_image(image_url["url"]) @@ -139,7 +141,7 @@ async def async_get_and_parse_image(image_url: str): text_prompt = "\n".join(texts) if vlm_config is not None and len(mm_futures): - + assert len(mm_futures) == 1, "Multiple images is not supported." (image_token_prompt, image_token_str) = vlm_config.get_image_token_text(self.tokenizer) @@ -166,8 +168,7 @@ async def async_get_and_parse_image(image_url: str): else: messages = [ConversationMessage(role=role, content=text_prompt)] - return ChatMessageParseResult(messages=messages, - mm_futures=mm_futures) + return ChatMessageParseResult(messages=messages, mm_futures=mm_futures) def _parse_chat_message_content( self, From 404700f8e4d683c3ae3006be9f31c5d3e31bd68c Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Sun, 30 Jun 2024 20:55:58 -0700 Subject: [PATCH 11/21] rm ctx Signed-off-by: Xiaowei Jiang --- vllm/entrypoints/openai/serving_chat.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 8c844f83077d..55d5c383a6ee 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -128,9 +128,8 @@ def _parse_chat_message_content_parts( "will be ignored.") async def async_get_and_parse_image(image_url: str): - with await ImageFetchAiohttp.fetch_image(image_url - ) as image: - return {"image": image} + image = await ImageFetchAiohttp.fetch_image(image_url) + return {"image": image} mm_future = async_get_and_parse_image(image_url["url"]) mm_futures.append(mm_future) From 04ebb6809a77e81bd41c7a9565b8ba2a522d98b1 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 1 Jul 2024 08:18:51 -0700 Subject: [PATCH 12/21] rm ImageData and MultiModalData Signed-off-by: Xiaowei Jiang --- .../dev/multimodal/multimodal_index.rst | 2 +- examples/openai_vision_api_client.py | 1 - tests/conftest.py | 15 +-- tests/models/test_llava.py | 4 +- tests/models/test_llava_next.py | 4 +- tests/models/test_phi3v.py | 4 +- tests/multimodal/test_mapper.py | 9 +- tests/spec_decode/e2e/conftest.py | 4 +- vllm/inputs/data.py | 8 +- vllm/inputs/registry.py | 7 +- vllm/model_executor/models/clip.py | 5 +- vllm/model_executor/models/llava.py | 7 +- vllm/model_executor/models/llava_next.py | 38 +++--- vllm/model_executor/models/phi3v.py | 15 ++- vllm/multimodal/__init__.py | 3 +- vllm/multimodal/base.py | 57 +++------ vllm/multimodal/image.py | 39 ++---- vllm/multimodal/registry.py | 114 +++++++----------- vllm/sequence.py | 4 +- 19 files changed, 126 insertions(+), 214 deletions(-) diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index f6fdfc1debff..d551eb899acc 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -29,7 +29,7 @@ Registry Base Classes ------------ -.. autoclass:: vllm.multimodal.MultiModalData +.. 
autoclass:: vllm.multimodal.ExternalMultiModalDataDict :members: :show-inheritance: diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index 26f2aa651fca..fcda1345f576 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -3,7 +3,6 @@ Launch the vLLM server with the following command: python -m vllm.entrypoints.openai.api_server \ --model llava-hf/llava-1.5-7b-hf \ - --image-input-type pixel_values \ --image-token-id 32000 \ --image-input-shape 1,3,336,336 \ --image-feature-size 576 \ diff --git a/tests/conftest.py b/tests/conftest.py index 43609f257da9..dca87149ce32 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,8 @@ from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, - TypedDict, TypeVar) +from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict, + TypeVar) import pytest import torch @@ -22,12 +22,7 @@ destroy_model_parallel) from vllm.inputs import TextPrompt from vllm.logger import init_logger - -if TYPE_CHECKING: - from vllm.multimodal import MultiModalData -else: - # it will call torch.cuda.device_count() - MultiModalData = None +from vllm.multimodal import ExternalMultiModalDataDict from vllm.sequence import SampleLogprobs from vllm.utils import cuda_device_count_stateless, is_cpu @@ -433,7 +428,7 @@ def generate( self, prompts: List[str], sampling_params: SamplingParams, - images: Optional[List[MultiModalData]] = None, + images: Optional[List[ExternalMultiModalDataDict]] = None, ) -> List[Tuple[List[List[int]], List[str]]]: if images is not None: assert len(prompts) == len(images) @@ -482,7 +477,7 @@ def generate_greedy( self, prompts: List[str], max_tokens: int, - images: Optional[List[MultiModalData]] = None, + images: Optional[List[ExternalMultiModalDataDict]] = None, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images) diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index f2bac4235d8f..7f4a3a597874 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -75,8 +75,8 @@ def run_test( All the image fixtures for the test is under tests/images. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - vision language config as input. + For vllm runner, we provide ExternalMultiModalDataDict objects + and corresponding vision language config as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. """ diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index b7dc3e8ef796..b03e00923ca2 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -82,8 +82,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config, All the image fixtures for the test is under tests/images. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - vision language config as input. + For vllm runner, we provide ExternalMultiModalDataDict objects + and corresponding vision language config as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. 
""" diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 7297afb4a1c9..5809e6f83755 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -84,8 +84,8 @@ def run_test( All the image fixtures for the test is under tests/images. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide MultiModalData objects and corresponding - vision language config as input. + For vllm runner, we provide ExternalMultiModalDataDict objects + and corresponding vision language config as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. """ diff --git a/tests/multimodal/test_mapper.py b/tests/multimodal/test_mapper.py index 2238b0e9f9f7..bdbbd9abfc5c 100644 --- a/tests/multimodal/test_mapper.py +++ b/tests/multimodal/test_mapper.py @@ -4,7 +4,6 @@ from vllm.config import ModelConfig from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import ImageData from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE @@ -33,7 +32,7 @@ def test_clip_image_processor(image_assets, dtype): ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.map_input( model_config, - ImageData(asset.pil_image), + {"image": asset.pil_image}, ) assert hf_result.keys() == vllm_result.keys() @@ -72,7 +71,7 @@ def test_llava_next_image_processor(image_assets, dtype): ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype]) vllm_result = MULTIMODAL_REGISTRY.map_input( model_config, - ImageData(asset.pil_image), + {"image": asset.pil_image}, ) assert hf_result.keys() == vllm_result.keys() @@ -102,11 +101,11 @@ def test_image_pixel_types(image_assets, dtype): for asset in image_assets: image_result = MULTIMODAL_REGISTRY.map_input( model_config, - ImageData(asset.pil_image), + {"image": asset.pil_image}, ) tensor_result = MULTIMODAL_REGISTRY.map_input( model_config, - ImageData(asset.pil_image), + {"image": asset.pil_image}, ) assert image_result.keys() == tensor_result.keys() diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index 60dfe33f2918..f75caef6a5b5 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -11,7 +11,7 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.lora.request import LoRARequest from vllm.model_executor.utils import set_random_seed -from vllm.multimodal import MultiModalData +from vllm.multimodal import ExternalMultiModalDataDict from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob @@ -91,7 +91,7 @@ def generate( prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[MultiModalData] = None, + multi_modal_data: Optional[ExternalMultiModalDataDict] = None, ) -> List[RequestOutput]: if prompts is None: diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index 2f761057bea5..df8c38ead21a 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -4,7 +4,7 @@ from typing_extensions import NotRequired if TYPE_CHECKING: - from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData + from vllm.multimodal import ExternalMultiModalDataDict class ParsedText(TypedDict): @@ -72,7 +72,7 @@ class TextPrompt(TypedDict): prompt: str """The input text to be tokenized before passing to the model.""" - multi_modal_data: NotRequired["MultiModalData"] + multi_modal_data: 
NotRequired["ExternalMultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. @@ -85,7 +85,7 @@ class TokensPrompt(TypedDict): prompt_token_ids: List[int] """A list of token IDs to pass to the model.""" - multi_modal_data: NotRequired["MultiModalData"] + multi_modal_data: NotRequired["ExternalMultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. @@ -103,7 +103,7 @@ class TextTokensPrompt(TypedDict): prompt_token_ids: List[int] """The token IDs of the prompt.""" - multi_modal_data: NotRequired["MultiModalData"] + multi_modal_data: NotRequired["ExternalMultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 8f4e108b8cca..85552db4aefe 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: from vllm.config import ModelConfig, VisionLanguageConfig - from vllm.multimodal import MultiModalData + from vllm.multimodal import ExternalMultiModalDataDict from vllm.sequence import SequenceData logger = init_logger(__name__) @@ -66,7 +66,8 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C: N = TypeVar("N", bound=Type[nn.Module]) DummyDataFactory = Callable[[InputContext, int], - Tuple["SequenceData", Optional["MultiModalData"]]] + Tuple["SequenceData", + Optional["ExternalMultiModalDataDict"]]] """ Create dummy data to be inputted into the model. @@ -94,7 +95,7 @@ def _default_dummy_data_factory( self, ctx: InputContext, seq_len: int, - ) -> Tuple["SequenceData", Optional["MultiModalData"]]: + ) -> Tuple["SequenceData", Optional["ExternalMultiModalDataDict"]]: """ The default dummy data factory represents the longest possible text that can be inputted to the model. 
diff --git a/vllm/model_executor/models/clip.py b/vllm/model_executor/models/clip.py index f6dfbf17a792..5212e2808fb3 100644 --- a/vllm/model_executor/models/clip.py +++ b/vllm/model_executor/models/clip.py @@ -12,7 +12,6 @@ from vllm.model_executor.layers.linear import (ColumnParallelLinear, RowParallelLinear) from vllm.model_executor.layers.quantization import QuantizationConfig -from vllm.multimodal.image import ImageData from vllm.sequence import SequenceData @@ -49,7 +48,7 @@ def dummy_seq_data_for_clip( return SequenceData(token_ids) -def dummy_pixel_data_for_clip( +def dummy_image_for_clip( hf_config: CLIPVisionConfig, *, image_width_override: Optional[int] = None, @@ -62,7 +61,7 @@ def dummy_pixel_data_for_clip( height = image_height_override image = Image.new("RGB", (width, height), color=0) - return ImageData(image) + return {"image": image} # Adapted from https://github.com/huggingface/transformers/blob/v4.39.0/src/transformers/models/clip/modeling_clip.py#L164 # noqa diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 8c3926e4a5f0..5e48c5530a0c 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -17,10 +17,10 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import SamplerOutput -from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip +from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsVision _KEYS_TO_MODIFY_MAPPING = { @@ -89,8 +89,7 @@ def dummy_data_for_llava(ctx: InputContext, seq_len: int): image_token_id=hf_config.image_token_index, ) - mm_data: MultiModalData - mm_data = dummy_pixel_data_for_clip(vision_config) + mm_data = dummy_image_for_clip(vision_config) return seq_data, mm_data msg = f"Unsupported vision config: {type(vision_config)}" diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py index c113ead67b05..3c0988137f7c 100644 --- a/vllm/model_executor/models/llava_next.py +++ b/vllm/model_executor/models/llava_next.py @@ -2,6 +2,7 @@ import torch import torch.nn as nn +from PIL import Image from transformers import CLIPVisionConfig, LlavaNextConfig from transformers.models.llava_next.modeling_llava_next import ( get_anyres_image_grid_shape, unpad_image) @@ -20,11 +21,10 @@ from vllm.model_executor.models.clip import CLIPVisionModel from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData -from vllm.multimodal.image import ImageData +from vllm.multimodal import MULTIMODAL_REGISTRY from vllm.sequence import SamplerOutput -from .clip import (dummy_pixel_data_for_clip, dummy_seq_data_for_clip, +from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_patch_grid_length) from .interfaces import SupportsVision from .llava import LlavaMultiModalProjector, merge_vision_embeddings @@ -127,8 +127,7 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int): image_feature_size_override=image_feature_size, ) - mm_data: MultiModalData - mm_data = dummy_pixel_data_for_clip( + mm_data = dummy_image_for_clip( vision_config, image_width_override=dummy_width, 
image_height_override=dummy_height, @@ -140,28 +139,23 @@ def dummy_data_for_llava_next(ctx: InputContext, seq_len: int): raise NotImplementedError(msg) -def _pixel_mapper(ctx: InputContext, - data: ImageData) -> Dict[str, torch.Tensor]: - image = data.image +def _pixel_mapper(ctx: InputContext, image: object) -> Dict[str, torch.Tensor]: - if isinstance(image, torch.Tensor): - pixel_values = image.to(ctx.model_config.dtype) - batch_size, _, _, h, w = pixel_values.shape - image_sizes = torch.tensor([(w, h) for _ in range(batch_size)]) + if isinstance(image, Image.Image): - return {"pixel_values": pixel_values, "image_sizes": image_sizes} + # Temporary patch before dynamic number of image tokens is supported + _, _, h, w = ctx.get_multimodal_config().image_input_shape + if (w, h) != (image.width, image.height): + logger.warning( + "Dynamic image shape is currently not supported. " + "Resizing input image to (%d, %d).", w, h) - # Temporary patch before dynamic number of image tokens is supported - _, _, h, w = ctx.get_multimodal_config().image_input_shape - if (w, h) != (image.width, image.height): - logger.warning( - "Dynamic image shape is currently not supported. " - "Resizing input image to (%d, %d).", w, h) + image = image.resize((w, h)) - data.image = image.resize((w, h)) + return MULTIMODAL_REGISTRY._get_plugin("image") \ + ._default_input_mapper(ctx, image) - return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \ - ._default_input_mapper(ctx, data) + raise TypeError(f"Invalid type for 'image': {type(image)}") @MULTIMODAL_REGISTRY.register_image_input_mapper(_pixel_mapper) diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py index 5e53fed9aa92..a16f7f0ea570 100644 --- a/vllm/model_executor/models/phi3v.py +++ b/vllm/model_executor/models/phi3v.py @@ -35,10 +35,9 @@ from vllm.model_executor.models.llama import LlamaModel from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.image import ImageData from vllm.sequence import SamplerOutput -from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip +from .clip import dummy_image_for_clip, dummy_seq_data_for_clip from .interfaces import SupportsVision logger = init_logger(__name__) @@ -286,7 +285,7 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int): image_token_id=32044, image_feature_size_override=image_feature_size, ) - mm_data = dummy_pixel_data_for_clip( + mm_data = dummy_image_for_clip( CLIP_VIT_LARGE_PATCH14_336_CONFIG, image_width_override=dummy_width, image_height_override=dummy_height, @@ -331,8 +330,7 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16): def _image_processor(ctx: InputContext, - data: ImageData) -> Dict[str, torch.Tensor]: - image = data.image + image: object) -> Dict[str, torch.Tensor]: if isinstance(image, Image.Image): # Temporary patch before dynamic number of image tokens is supported @@ -343,10 +341,11 @@ def _image_processor(ctx: InputContext, "Dynamic image shape is currently not supported. 
" "Resizing input image to (%d, %d).", w, h) - data.image = image.resize((w, h)) + image = image.resize((w, h)) - return MULTIMODAL_REGISTRY._get_plugin_for_internal_data_type(ImageData) \ - ._default_input_mapper(ctx, data) + return MULTIMODAL_REGISTRY._get_plugin("image") \ + ._default_input_mapper(ctx, image) + raise TypeError(f"Invalid type for 'image': {type(image)}") @MULTIMODAL_REGISTRY.register_image_input_mapper(_image_processor) diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index a9bd58a29549..ebbad488ecdc 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,4 +1,4 @@ -from .base import ExternalMultiModalDataDict, MultiModalData, MultiModalPlugin +from .base import ExternalMultiModalDataDict, MultiModalPlugin from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -11,7 +11,6 @@ """ __all__ = [ - "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY", "MultiModalRegistry", diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 5a2e28cd7a39..8222e94c3c85 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -1,6 +1,6 @@ from abc import ABC, abstractmethod -from typing import (TYPE_CHECKING, Any, Callable, Dict, Generic, Optional, - Tuple, Type, TypedDict, TypeVar, Union) +from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Type, + TypedDict, TypeVar, Union) from vllm.config import ModelConfig from vllm.inputs import InputContext @@ -13,27 +13,6 @@ logger = init_logger(__name__) - -class MultiModalData: - """ - Base class that contains multi-modal data. - - This is for internal use. - - To add a new modality, add a new file under ``multimodal`` directory. - - In this new file, subclass :class:`~MultiModalData` and - :class:`~MultiModalPlugin`. - - Finally, register the new plugin to - :const:`vllm.multimodal.MULTIMODAL_REGISTRY` (beyond default plugins). - This enables models to call :meth:`MultiModalRegistry.map_input` for - the new modality. - """ - pass - - -D = TypeVar("D", bound=MultiModalData) N = TypeVar("N", bound=Type["nn.Module"]) @@ -44,13 +23,14 @@ class ExternalMultiModalDataBuiltins(TypedDict, total=False): ExternalMultiModalDataDict = Union[ExternalMultiModalDataBuiltins, Dict[str, Any]] -MultiModalInputMapper = Callable[[InputContext, D], Dict[str, "torch.Tensor"]] +MultiModalInputMapper = Callable[[InputContext, object], Dict[str, + "torch.Tensor"]] """Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers and processors in HuggingFace Transformers.""" -class MultiModalPlugin(ABC, Generic[D]): +class MultiModalPlugin(ABC): """ Base class that defines data processing logic for a specific modality. @@ -63,29 +43,18 @@ class MultiModalPlugin(ABC, Generic[D]): def __init__(self) -> None: self._input_mappers: Dict[Type["nn.Module"], - MultiModalInputMapper[D]] = {} + MultiModalInputMapper] = {} @abstractmethod - def get_internal_data_type(self) -> Type[D]: + def get_data_key(self) -> str: """ - Get the modality (subclass of :class:`~MultiModalData`) served by - this plugin. - """ - raise NotImplementedError - - @abstractmethod - def get_external_data_type(self) -> Tuple[str, Type[Any]]: - """The data type that this plugin handles. - - For `LLM.generate(multi_modal_data={"key": value})` will - be handled by plugin with an external data type of - (key, type(value)). + Get the data key corresponding to the modality. 
""" raise NotImplementedError @abstractmethod def _default_input_mapper(self, ctx: InputContext, - data: D) -> Dict[str, "torch.Tensor"]: + data: object) -> Dict[str, "torch.Tensor"]: """Return a dictionary to be passed as keyword arguments to :meth:`~torch.nn.Module.forward`. This is similar in concept to tokenizers and processors in HuggingFace Transformers. @@ -94,7 +63,7 @@ def _default_input_mapper(self, ctx: InputContext, def register_input_mapper( self, - mapper: Optional[MultiModalInputMapper[D]] = None, + mapper: Optional[MultiModalInputMapper] = None, ): """ Register an input mapper to a model class. @@ -122,11 +91,13 @@ def wrapper(model_cls: N) -> N: return wrapper def map_input(self, model_config: ModelConfig, - data: D) -> Dict[str, "torch.Tensor"]: + data: object) -> Dict[str, "torch.Tensor"]: """ - Apply an input mapper to a :class:`~MultiModalData` instance passed + Apply an input mapper to a data passed to the model, transforming the data into a dictionary of model inputs. + If the data is not something that the mapper expects, throws TypeError. + The model is identified by ``model_config``. TODO: Add guide [ref: PR #5276] diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py index d8d8ee4170eb..a0b4206bf2ee 100644 --- a/vllm/multimodal/image.py +++ b/vllm/multimodal/image.py @@ -1,5 +1,5 @@ from functools import lru_cache -from typing import Dict, Tuple, Type +from typing import Dict import torch from PIL import Image @@ -9,35 +9,17 @@ from vllm.logger import init_logger from vllm.transformers_utils.image_processor import get_image_processor -from .base import MultiModalData, MultiModalPlugin +from .base import MultiModalPlugin logger = init_logger(__name__) cached_get_image_processor = lru_cache(get_image_processor) -class ImageData(MultiModalData): - """ - Contains a :class:`PIL.Image.Image` object. Requires that a HuggingFace - processor is available to the model. 
- """ +class ImagePlugin(MultiModalPlugin): - def __init__(self, image: Image.Image) -> None: - # So that this class can be created inside the Image context manager - image.load() - self.image = image - - def __repr__(self) -> str: - return f"{type(self).__name__}(image={self.image})" - - -class ImagePlugin(MultiModalPlugin[ImageData]): - - def get_internal_data_type(self) -> Type[ImageData]: - return ImageData - - def get_external_data_type(self) -> Tuple[str, Type[Image.Image]]: - return ("image", Image.Image) + def get_data_key(self) -> str: + return "image" def _get_hf_image_processor(self, model_config: ModelConfig): return cached_get_image_processor( @@ -45,19 +27,18 @@ def _get_hf_image_processor(self, model_config: ModelConfig): trust_remote_code=model_config.trust_remote_code) def _default_input_mapper(self, ctx: InputContext, - data: ImageData) -> Dict[str, torch.Tensor]: + data: object) -> Dict[str, torch.Tensor]: model_config = ctx.model_config - image = data.image - if isinstance(image, Image.Image): + if isinstance(data, Image.Image): image_processor = self._get_hf_image_processor(model_config) if image_processor is None: raise RuntimeError("No HuggingFace processor is available" "to process the image object") try: - return image_processor.preprocess(image, return_tensors="pt") \ + return image_processor.preprocess(data, return_tensors="pt") \ .to(model_config.dtype).data except Exception: - logger.error("Failed to process image (%s)", image) + logger.error("Failed to process image (%s)", data) raise - raise TypeError(f"Invalid image type: {type(image)}") + raise TypeError(f"Invalid type for 'image': {type(data)}") diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index a0bd960705e3..d5a78d851917 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -1,18 +1,17 @@ import functools -from typing import Any, Optional, Sequence, Type, TypeVar, Union +from typing import Optional, Sequence, Type, TypeVar from torch import nn from vllm.config import ModelConfig from vllm.logger import init_logger -from .base import (ExternalMultiModalDataDict, MultiModalData, - MultiModalInputMapper, MultiModalPlugin) -from .image import ImageData, ImagePlugin +from .base import (ExternalMultiModalDataDict, MultiModalInputMapper, + MultiModalPlugin) +from .image import ImagePlugin logger = init_logger(__name__) -D = TypeVar("D", bound=MultiModalData) N = TypeVar("N", bound=Type[nn.Module]) @@ -26,109 +25,86 @@ class MultiModalRegistry: DEFAULT_PLUGINS = (ImagePlugin(), ) - def __init__(self, - *, - plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS - ) -> None: - self._plugins_by_internal_data_type = { - p.get_internal_data_type(): p - for p in plugins - } - self._plugins_by_external_data_type = { - p.get_external_data_type(): p - for p in plugins - } - - def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None: - data_type = plugin.get_internal_data_type() - - if data_type in self._plugins_by_internal_data_type: + def __init__( + self, + *, + plugins: Sequence[MultiModalPlugin] = DEFAULT_PLUGINS) -> None: + self._plugins = {p.get_data_key(): p for p in plugins} + + def register_plugin(self, plugin: MultiModalPlugin) -> None: + data_type_key = plugin.get_data_key() + + if data_type_key in self._plugins: logger.warning( "A plugin is already registered for data type %s, " - "and will be overwritten by the new plugin %s.", data_type, + "and will be overwritten by the new plugin %s.", data_type_key, plugin) - 
self._plugins_by_internal_data_type[data_type] = plugin + self._plugins[data_type_key] = plugin + + def _get_plugin(self, data_type_key: str): + plugin = self._plugins.get(data_type_key) + if plugin is not None: + return plugin + + msg = f"Unknown multi-modal data type: {data_type_key}" + raise NotImplementedError(msg) def register_image_input_mapper( self, - mapper: Optional[MultiModalInputMapper[ImageData]] = None, + mapper: Optional[MultiModalInputMapper] = None, ): """ Register an input mapper for image pixel data to a model class. See :meth:`MultiModalPlugin.register_input_mapper` for more details. """ - return self.register_input_mapper(ImageData, mapper) + return self.register_input_mapper("image", mapper) def _process_external_input(self, key, value, model_config: ModelConfig): - plugin = self._get_plugin_for_external_data_type(key, type(value)) + plugin = self._plugins.get(key) if plugin: - return plugin.map_input(model_config, - plugin.get_internal_data_type()(value)) - msg = f"Unknown multi-modal data type: {type(value)}" - raise NotImplementedError(msg) - - def _get_plugin_for_external_data_type(self, key: str, - data_type: Type[Any]): - for typ in data_type.mro(): - plugin = self._plugins_by_external_data_type.get((key, typ)) - if plugin is not None: - return plugin - - msg = f"No plugin found for key {key} and type {data_type}" - raise NotImplementedError(msg) - - def _get_plugin_for_internal_data_type(self, - data_type: Type[MultiModalData]): - for typ in data_type.mro(): - plugin = self._plugins_by_internal_data_type.get(typ) - if plugin is not None: - return plugin - - msg = f"Unknown multi-modal data type: {data_type}" + return plugin.map_input(model_config, value) + msg = f"Unknown multi-modal data type: {key}" raise NotImplementedError(msg) def register_input_mapper( self, - data_type: Type[D], - mapper: Optional[MultiModalInputMapper[D]] = None, + data_type: str, + mapper: Optional[MultiModalInputMapper] = None, ): """ Register an input mapper for a specific modality to a model class. See :meth:`MultiModalPlugin.register_input_mapper` for more details. """ - return self._get_plugin_for_internal_data_type(data_type) \ - .register_input_mapper(mapper) + plugin = self._plugins.get(data_type) + if not plugin: + msg = f"Unknown multi-modal data type: {data_type}" + raise NotImplementedError(msg) + return plugin.register_input_mapper(mapper) def register_image_input(self, - mapper: Optional[ - MultiModalInputMapper[ImageData]] = None): + mapper: Optional[MultiModalInputMapper] = None): """ Register an input mapper for image pixel data to a model class. See :meth:`MultiModalPlugin.register_input_mapper` for more details. """ - return self.register_input_mapper(ImageData, mapper) + return self.register_input_mapper("image", mapper) def map_input(self, model_config: ModelConfig, - data: Union[MultiModalData, ExternalMultiModalDataDict]): + data: ExternalMultiModalDataDict): """ - Apply an input mapper to a :class:`~MultiModalData` instance passed - to the model. + Apply an input mapper to the data passed to the model. See :meth:`MultiModalPlugin.map_input` for more details. 
""" - if isinstance(data, MultiModalData): - return self._get_plugin_for_internal_data_type(type(data)) \ - .map_input(model_config, data) - else: - result_list = [ - self._process_external_input(k, v, model_config) - for k, v in data.items() - ] - return {k: v for d in result_list for k, v in d.items()} + result_list = [ + self._process_external_input(k, v, model_config) + for k, v in data.items() + ] + return {k: v for d in result_list for k, v in d.items()} def create_input_mapper(self, model_config: ModelConfig): """ diff --git a/vllm/sequence.py b/vllm/sequence.py index c26f778674e5..69d68a6312fe 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from vllm.inputs import LLMInputs - from vllm.multimodal import ExternalMultiModalDataDict, MultiModalData + from vllm.multimodal import ExternalMultiModalDataDict from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -432,7 +432,7 @@ def prompt_token_ids(self) -> List[int]: return next(iter(self.seqs_dict.values())).prompt_token_ids @property - def multi_modal_data(self) -> Optional["MultiModalData"]: + def multi_modal_data(self) -> Optional["ExternalMultiModalDataDict"]: # All sequences in the group should have the same multi-modal data. # We use the multi-modal data of an arbitrary sequence. return next(iter(self.seqs_dict.values())).multi_modal_data From 31b8b09f66ca8412af1bf5d6a8cd14607aebc817 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 1 Jul 2024 08:28:44 -0700 Subject: [PATCH 13/21] rm external Signed-off-by: Xiaowei Jiang --- docs/source/dev/multimodal/multimodal_index.rst | 2 +- docs/source/models/vlm.rst | 2 +- tests/conftest.py | 6 +++--- tests/models/test_llava.py | 2 +- tests/models/test_llava_next.py | 2 +- tests/models/test_phi3v.py | 2 +- tests/spec_decode/e2e/conftest.py | 4 ++-- vllm/entrypoints/openai/serving_chat.py | 10 +++++----- vllm/inputs/data.py | 10 +++++----- vllm/inputs/registry.py | 6 +++--- vllm/multimodal/__init__.py | 4 ++-- vllm/multimodal/base.py | 5 ++--- vllm/multimodal/registry.py | 11 ++++------- vllm/sequence.py | 8 ++++---- 14 files changed, 35 insertions(+), 39 deletions(-) diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index d551eb899acc..8aa6bd48d7b2 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -29,7 +29,7 @@ Registry Base Classes ------------ -.. autoclass:: vllm.multimodal.ExternalMultiModalDataDict +.. autoclass:: vllm.multimodal.MultiModalDataDict :members: :show-inheritance: diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index d4bb86ecf576..fe11af0b90e8 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -48,7 +48,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.ExternalMultiModalDataDict`. +* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.MultiModalDataDict`. .. 
code-block:: python diff --git a/tests/conftest.py b/tests/conftest.py index dca87149ce32..cc318ee46994 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -22,7 +22,7 @@ destroy_model_parallel) from vllm.inputs import TextPrompt from vllm.logger import init_logger -from vllm.multimodal import ExternalMultiModalDataDict +from vllm.multimodal import MultiModalDataDict from vllm.sequence import SampleLogprobs from vllm.utils import cuda_device_count_stateless, is_cpu @@ -428,7 +428,7 @@ def generate( self, prompts: List[str], sampling_params: SamplingParams, - images: Optional[List[ExternalMultiModalDataDict]] = None, + images: Optional[List[MultiModalDataDict]] = None, ) -> List[Tuple[List[List[int]], List[str]]]: if images is not None: assert len(prompts) == len(images) @@ -477,7 +477,7 @@ def generate_greedy( self, prompts: List[str], max_tokens: int, - images: Optional[List[ExternalMultiModalDataDict]] = None, + images: Optional[List[MultiModalDataDict]] = None, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images) diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index 7f4a3a597874..c6313c52e4e3 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -75,7 +75,7 @@ def run_test( All the image fixtures for the test is under tests/images. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide ExternalMultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding vision language config as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py index b03e00923ca2..e9babba13c47 100644 --- a/tests/models/test_llava_next.py +++ b/tests/models/test_llava_next.py @@ -82,7 +82,7 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config, All the image fixtures for the test is under tests/images. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide ExternalMultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding vision language config as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 5809e6f83755..917bdbf94ab9 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -84,7 +84,7 @@ def run_test( All the image fixtures for the test is under tests/images. For huggingface runner, we provide the PIL images as input. - For vllm runner, we provide ExternalMultiModalDataDict objects + For vllm runner, we provide MultiModalDataDict objects and corresponding vision language config as input. Note, the text input is also adjusted to abide by vllm contract. The text output is sanitized to be able to compare with hf. 
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py index f75caef6a5b5..8ad8e9cb81ff 100644 --- a/tests/spec_decode/e2e/conftest.py +++ b/tests/spec_decode/e2e/conftest.py @@ -11,7 +11,7 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine from vllm.lora.request import LoRARequest from vllm.model_executor.utils import set_random_seed -from vllm.multimodal import ExternalMultiModalDataDict +from vllm.multimodal import MultiModalDataDict from vllm.outputs import RequestOutput from vllm.sampling_params import SamplingParams from vllm.sequence import Logprob @@ -91,7 +91,7 @@ def generate( prompt_token_ids: Optional[List[List[int]]] = None, use_tqdm: bool = True, lora_request: Optional[LoRARequest] = None, - multi_modal_data: Optional[ExternalMultiModalDataDict] = None, + multi_modal_data: Optional[MultiModalDataDict] = None, ) -> List[RequestOutput]: if prompts is None: diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 55d5c383a6ee..033dcaf0724c 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -26,7 +26,7 @@ from vllm.logger import init_logger from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) -from vllm.multimodal import ExternalMultiModalDataDict +from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import ImageFetchAiohttp, get_full_image_text_prompt from vllm.outputs import RequestOutput from vllm.sequence import Logprob @@ -46,7 +46,7 @@ class ConversationMessage(TypedDict): @dataclass(frozen=True) class ChatMessageParseResult: messages: List[ConversationMessage] - mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = field( + mm_futures: List[Awaitable[MultiModalDataDict]] = field( default_factory=list) @@ -102,7 +102,7 @@ def _parse_chat_message_content_parts( parts: Iterable[ChatCompletionContentPartParam], ) -> ChatMessageParseResult: texts: List[str] = [] - mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = [] + mm_futures: List[Awaitable[MultiModalDataDict]] = [] vlm_config: Optional[VisionLanguageConfig] = getattr( self.engine.engine, "vision_language_config", None) @@ -205,7 +205,7 @@ async def create_chat_completion( try: conversation: List[ConversationMessage] = [] - mm_futures: List[Awaitable[ExternalMultiModalDataDict]] = [] + mm_futures: List[Awaitable[MultiModalDataDict]] = [] for msg in request.messages: chat_parsed_result = self._parse_chat_message_content(msg) @@ -222,7 +222,7 @@ async def create_chat_completion( logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) - mm_data: Optional[ExternalMultiModalDataDict] = None + mm_data: Optional[MultiModalDataDict] = None try: if len(mm_futures): # since we support only single mm data currently diff --git a/vllm/inputs/data.py b/vllm/inputs/data.py index df8c38ead21a..c6381fcc01e5 100644 --- a/vllm/inputs/data.py +++ b/vllm/inputs/data.py @@ -4,7 +4,7 @@ from typing_extensions import NotRequired if TYPE_CHECKING: - from vllm.multimodal import ExternalMultiModalDataDict + from vllm.multimodal import MultiModalDataDict class ParsedText(TypedDict): @@ -72,7 +72,7 @@ class TextPrompt(TypedDict): prompt: str """The input text to be tokenized before passing to the model.""" - multi_modal_data: NotRequired["ExternalMultiModalDataDict"] + multi_modal_data: NotRequired["MultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. 
@@ -85,7 +85,7 @@ class TokensPrompt(TypedDict): prompt_token_ids: List[int] """A list of token IDs to pass to the model.""" - multi_modal_data: NotRequired["ExternalMultiModalDataDict"] + multi_modal_data: NotRequired["MultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. @@ -103,7 +103,7 @@ class TextTokensPrompt(TypedDict): prompt_token_ids: List[int] """The token IDs of the prompt.""" - multi_modal_data: NotRequired["ExternalMultiModalDataDict"] + multi_modal_data: NotRequired["MultiModalDataDict"] """ Optional multi-modal data to pass to the model, if the model supports it. @@ -136,7 +136,7 @@ class LLMInputs(TypedDict): The original prompt text corresponding to the token IDs, if available. """ - multi_modal_data: NotRequired[Optional["ExternalMultiModalDataDict"]] + multi_modal_data: NotRequired[Optional["MultiModalDataDict"]] """ Optional multi-modal data to pass to the model, if the model supports it. diff --git a/vllm/inputs/registry.py b/vllm/inputs/registry.py index 85552db4aefe..3e28733383cb 100644 --- a/vllm/inputs/registry.py +++ b/vllm/inputs/registry.py @@ -12,7 +12,7 @@ if TYPE_CHECKING: from vllm.config import ModelConfig, VisionLanguageConfig - from vllm.multimodal import ExternalMultiModalDataDict + from vllm.multimodal import MultiModalDataDict from vllm.sequence import SequenceData logger = init_logger(__name__) @@ -67,7 +67,7 @@ def get_hf_config(self, hf_config_type: Type[C]) -> C: DummyDataFactory = Callable[[InputContext, int], Tuple["SequenceData", - Optional["ExternalMultiModalDataDict"]]] + Optional["MultiModalDataDict"]]] """ Create dummy data to be inputted into the model. @@ -95,7 +95,7 @@ def _default_dummy_data_factory( self, ctx: InputContext, seq_len: int, - ) -> Tuple["SequenceData", Optional["ExternalMultiModalDataDict"]]: + ) -> Tuple["SequenceData", Optional["MultiModalDataDict"]]: """ The default dummy data factory represents the longest possible text that can be inputted to the model. 
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py index ebbad488ecdc..256eadd2d7df 100644 --- a/vllm/multimodal/__init__.py +++ b/vllm/multimodal/__init__.py @@ -1,4 +1,4 @@ -from .base import ExternalMultiModalDataDict, MultiModalPlugin +from .base import MultiModalDataDict, MultiModalPlugin from .registry import MultiModalRegistry MULTIMODAL_REGISTRY = MultiModalRegistry() @@ -14,5 +14,5 @@ "MultiModalPlugin", "MULTIMODAL_REGISTRY", "MultiModalRegistry", - "ExternalMultiModalDataDict", + "MultiModalDataDict", ] diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 8222e94c3c85..04d02c70a381 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -16,12 +16,11 @@ N = TypeVar("N", bound=Type["nn.Module"]) -class ExternalMultiModalDataBuiltins(TypedDict, total=False): +class MultiModalDataBuiltins(TypedDict, total=False): image: Union["Image.Image", "torch.Tensor"] -ExternalMultiModalDataDict = Union[ExternalMultiModalDataBuiltins, Dict[str, - Any]] +MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]] MultiModalInputMapper = Callable[[InputContext, object], Dict[str, "torch.Tensor"]] diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index d5a78d851917..a30a5b9c3afc 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -6,8 +6,7 @@ from vllm.config import ModelConfig from vllm.logger import init_logger -from .base import (ExternalMultiModalDataDict, MultiModalInputMapper, - MultiModalPlugin) +from .base import MultiModalDataDict, MultiModalInputMapper, MultiModalPlugin from .image import ImagePlugin logger = init_logger(__name__) @@ -61,7 +60,7 @@ def register_image_input_mapper( """ return self.register_input_mapper("image", mapper) - def _process_external_input(self, key, value, model_config: ModelConfig): + def _process_input(self, key, value, model_config: ModelConfig): plugin = self._plugins.get(key) if plugin: return plugin.map_input(model_config, value) @@ -93,16 +92,14 @@ def register_image_input(self, """ return self.register_input_mapper("image", mapper) - def map_input(self, model_config: ModelConfig, - data: ExternalMultiModalDataDict): + def map_input(self, model_config: ModelConfig, data: MultiModalDataDict): """ Apply an input mapper to the data passed to the model. See :meth:`MultiModalPlugin.map_input` for more details. 
""" result_list = [ - self._process_external_input(k, v, model_config) - for k, v in data.items() + self._process_input(k, v, model_config) for k, v in data.items() ] return {k: v for d in result_list for k, v in d.items()} diff --git a/vllm/sequence.py b/vllm/sequence.py index 69d68a6312fe..33b831b48003 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -14,7 +14,7 @@ if TYPE_CHECKING: from vllm.inputs import LLMInputs - from vllm.multimodal import ExternalMultiModalDataDict + from vllm.multimodal import MultiModalDataDict from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics @@ -255,7 +255,7 @@ def prompt_token_ids(self) -> List[int]: return self.inputs["prompt_token_ids"] @property - def multi_modal_data(self) -> "ExternalMultiModalDataDict": + def multi_modal_data(self) -> "MultiModalDataDict": return self.inputs.get("multi_modal_data") or {} @property @@ -432,7 +432,7 @@ def prompt_token_ids(self) -> List[int]: return next(iter(self.seqs_dict.values())).prompt_token_ids @property - def multi_modal_data(self) -> Optional["ExternalMultiModalDataDict"]: + def multi_modal_data(self) -> Optional["MultiModalDataDict"]: # All sequences in the group should have the same multi-modal data. # We use the multi-modal data of an arbitrary sequence. return next(iter(self.seqs_dict.values())).multi_modal_data @@ -614,7 +614,7 @@ def __init__( lora_request: Optional[LoRARequest] = None, computed_block_nums: Optional[List[int]] = None, state: Optional[SequenceGroupState] = None, - multi_modal_data: Optional["ExternalMultiModalDataDict"] = None, + multi_modal_data: Optional["MultiModalDataDict"] = None, encoder_seq_data: Optional[SequenceData] = None, cross_block_table: Optional[List[int]] = None, ) -> None: From a4b5617a3084b37ab5d3e55fb10312c06b3a6c01 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 1 Jul 2024 08:56:54 -0700 Subject: [PATCH 14/21] comments Signed-off-by: Xiaowei Jiang --- vllm/engine/arg_utils.py | 2 +- vllm/multimodal/base.py | 2 +- vllm/multimodal/registry.py | 3 ++- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index b8200ffad391..c9a31c975bea 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -614,7 +614,7 @@ def create_engine_config(self, ) -> EngineConfig: raise ValueError( "BitsAndBytes load format and QLoRA adapter only support " f"'bitsandbytes' quantization, but got {self.quantization}") - if self.image_token_id: + if self.image_token_id is not None: if (not self.image_input_shape or not self.image_feature_size): raise ValueError( 'Specify `image_input_shape` and ' diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 04d02c70a381..65dcd686ed08 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -17,7 +17,7 @@ class MultiModalDataBuiltins(TypedDict, total=False): - image: Union["Image.Image", "torch.Tensor"] + image: "Image.Image" MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]] diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index a30a5b9c3afc..9a297baa7011 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -60,7 +60,8 @@ def register_image_input_mapper( """ return self.register_input_mapper("image", mapper) - def _process_input(self, key, value, model_config: ModelConfig): + def _process_input(self, key: str, value: object, + model_config: ModelConfig): plugin = self._plugins.get(key) if plugin: return plugin.map_input(model_config, value) From 
045674d132891684c6de3acccf456c0e0bc2a036 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 1 Jul 2024 11:08:04 -0700 Subject: [PATCH 15/21] fix dist gpu test. Signed-off-by: Xiaowei Jiang --- tests/conftest.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index cc318ee46994..1aa054f05a62 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,8 +5,8 @@ from dataclasses import dataclass from functools import cached_property from pathlib import Path -from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict, - TypeVar) +from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple, + TypedDict, TypeVar) import pytest import torch @@ -22,10 +22,13 @@ destroy_model_parallel) from vllm.inputs import TextPrompt from vllm.logger import init_logger -from vllm.multimodal import MultiModalDataDict from vllm.sequence import SampleLogprobs from vllm.utils import cuda_device_count_stateless, is_cpu +if TYPE_CHECKING: + # it will call torch.cuda.device_count() + from vllm.multimodal import MultiModalDataDict + logger = init_logger(__name__) _TEST_DIR = os.path.dirname(__file__) @@ -428,7 +431,7 @@ def generate( self, prompts: List[str], sampling_params: SamplingParams, - images: Optional[List[MultiModalDataDict]] = None, + images: Optional[List["MultiModalDataDict"]] = None, ) -> List[Tuple[List[List[int]], List[str]]]: if images is not None: assert len(prompts) == len(images) @@ -477,7 +480,7 @@ def generate_greedy( self, prompts: List[str], max_tokens: int, - images: Optional[List[MultiModalDataDict]] = None, + images: Optional[List["MultiModalDataDict"]] = None, ) -> List[Tuple[List[int], str]]: greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) outputs = self.generate(prompts, greedy_params, images=images) From c8fa1505a894801f70fb42b4ec1c52edc7dd43f9 Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 1 Jul 2024 17:41:04 -0700 Subject: [PATCH 16/21] address comments --- vllm/entrypoints/openai/serving_chat.py | 11 +++++------ vllm/model_executor/model_loader/loader.py | 5 ++--- vllm/model_executor/models/llava.py | 1 + vllm/multimodal/utils.py | 6 ++++++ 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 033dcaf0724c..d978199b040f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -27,7 +27,8 @@ from vllm.model_executor.guided_decoding import ( get_guided_decoding_logits_processor) from vllm.multimodal import MultiModalDataDict -from vllm.multimodal.utils import ImageFetchAiohttp, get_full_image_text_prompt +from vllm.multimodal.utils import (async_get_and_parse_image, + get_full_image_text_prompt) from vllm.outputs import RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, @@ -127,10 +128,6 @@ def _parse_chat_message_content_parts( "'image_url.detail' is currently not supported and " "will be ignored.") - async def async_get_and_parse_image(image_url: str): - image = await ImageFetchAiohttp.fetch_image(image_url) - return {"image": image} - mm_future = async_get_and_parse_image(image_url["url"]) mm_futures.append(mm_future) @@ -141,7 +138,9 @@ async def async_get_and_parse_image(image_url: str): if vlm_config is not None and len(mm_futures): - assert len(mm_futures) == 1, "Multiple images is not supported." 
+ assert len( + mm_futures + ) == 1, "Multiple 'image_url' input is currently not supported." (image_token_prompt, image_token_str) = vlm_config.get_image_token_text(self.tokenizer) diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py index 02f12ecffa74..2322e9d39d03 100644 --- a/vllm/model_executor/model_loader/loader.py +++ b/vllm/model_executor/model_loader/loader.py @@ -84,9 +84,8 @@ def _get_model_initialization_kwargs( if supports_vision(model_class): if vlm_config is None: - raise ValueError("Provide vision " - "related configurations through LLM entrypoint " - "or engine arguments.") + raise ValueError("Provide vision related configurations " + "through LLM entrypoint or engine arguments.") extra_kwargs["vlm_config"] = vlm_config diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 5e48c5530a0c..e0134c5c452d 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -110,6 +110,7 @@ def __init__(self, self.config = config self.vlm_config = vlm_config + # TODO: Optionally initializes this for supporting embeddings. self.vision_tower = CLIPVisionModel(config.vision_config) self.multi_modal_projector = LlavaMultiModalProjector( diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index aad9822db78d..19e0ab63ad1f 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -8,6 +8,7 @@ from vllm.config import ModelConfig from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT +from vllm.vllm.multimodal.base import MultiModalDataDict class ImageFetchAiohttp: @@ -86,3 +87,8 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str, raise ValueError( f"Unsupported model type: {config.hf_config.model_type}") return full_prompt + + +async def async_get_and_parse_image(image_url: str) -> MultiModalDataDict: + image = await ImageFetchAiohttp.fetch_image(image_url) + return {"image": image} From b1f1813aac52134bda64df968798a07084e6c31d Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 1 Jul 2024 18:49:45 -0700 Subject: [PATCH 17/21] docs Signed-off-by: Xiaowei Jiang --- docs/requirements-docs.txt | 16 ++++------------ docs/source/dev/multimodal/multimodal_index.rst | 6 ++++-- docs/source/models/vlm.rst | 7 ++++++- vllm/multimodal/registry.py | 2 +- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/docs/requirements-docs.txt b/docs/requirements-docs.txt index ed569816200e..db076b2d801d 100644 --- a/docs/requirements-docs.txt +++ b/docs/requirements-docs.txt @@ -1,13 +1,5 @@ -sphinx == 6.2.1 -sphinx-book-theme == 1.0.1 -sphinx-copybutton == 0.5.2 -myst-parser == 2.0.0 +sphinx==6.2.1 +sphinx-book-theme==1.0.1 +sphinx-copybutton==0.5.2 +myst-parser==2.0.0 sphinx-argparse - -# packages to install to build the documentation -pydantic --f https://download.pytorch.org/whl/cpu -torch -py-cpuinfo -transformers -openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst index 8aa6bd48d7b2..4d5fb3246b68 100644 --- a/docs/source/dev/multimodal/multimodal_index.rst +++ b/docs/source/dev/multimodal/multimodal_index.rst @@ -9,8 +9,10 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm which allows you to pass in multi-modal input alongside text and token prompts. By default, vLLM models do not support multi-modal inputs. 
To enable multi-modal support for a model, -you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data `, -as well as :meth:`MULTIMODAL_REGISTRY.register_input ` for each modality type to support. +you must decorate the model class with :meth:`InputRegistry.register_dummy_data `, +as well as :meth:`MULTIMODAL_REGISTRY.register_input_mapper ` for each modality type to support. + +# TODO: Add more instructions on how to do that once embeddings is in. Module Contents +++++++++++++++ diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index fe11af0b90e8..053f5b8609ce 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -48,7 +48,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`: * ``prompt``: The prompt should have a number of ```` tokens equal to ``image_feature_size``. -* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`~vllm.multimodal.base.MultiModalDataDict`. +* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`. + +.. note:: + + ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through + :class:`vllm.multimodal.MULTIMODAL_REGISTRY`. .. code-block:: python diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 9a297baa7011..a09a80f89f4b 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -54,7 +54,7 @@ def register_image_input_mapper( mapper: Optional[MultiModalInputMapper] = None, ): """ - Register an input mapper for image pixel data to a model class. + Register an input mapper for image data to a model class. See :meth:`MultiModalPlugin.register_input_mapper` for more details. """ From b8b636d83f67584c31b898bd4cb6b0cd3a45872b Mon Sep 17 00:00:00 2001 From: xwjiang2010 <87673679+xwjiang2010@users.noreply.github.com> Date: Mon, 1 Jul 2024 19:11:12 -0700 Subject: [PATCH 18/21] Update vllm/multimodal/base.py Co-authored-by: Cyrus Leung --- vllm/multimodal/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index 65dcd686ed08..e41a814b07af 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -21,6 +21,11 @@ class MultiModalDataBuiltins(TypedDict, total=False): MultiModalDataDict = Union[MultiModalDataBuiltins, Dict[str, Any]] +""" +A dictionary containing an item for each modality type to input. + +The data belonging to each modality is converted into keyword arguments to the model by the corresponding mapper. By default, the mapper of the corresponding plugin with the same modality key is applied. +""" MultiModalInputMapper = Callable[[InputContext, object], Dict[str, "torch.Tensor"]] From 2c1d2912fa9ea5a6af42a11192286f9bc2a4b63f Mon Sep 17 00:00:00 2001 From: Xiaowei Jiang Date: Mon, 1 Jul 2024 19:20:45 -0700 Subject: [PATCH 19/21] format Signed-off-by: Xiaowei Jiang --- vllm/multimodal/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py index e41a814b07af..558cd1175298 100644 --- a/vllm/multimodal/base.py +++ b/vllm/multimodal/base.py @@ -24,7 +24,9 @@ class MultiModalDataBuiltins(TypedDict, total=False): """ A dictionary containing an item for each modality type to input. 
-The data belonging to each modality is converted into keyword arguments to the model by the corresponding mapper. By default, the mapper of the corresponding plugin with the same modality key is applied. +The data belonging to each modality is converted into keyword arguments +to the model by the corresponding mapper. By default, the mapper of +the corresponding plugin with the same modality key is applied. """ MultiModalInputMapper = Callable[[InputContext, object], Dict[str, From 4278fed0342c78ffcbc49b59bac4d4e1215504a3 Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 1 Jul 2024 22:20:23 -0700 Subject: [PATCH 20/21] fix import error --- vllm/multimodal/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 19e0ab63ad1f..321b51e5a883 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -8,7 +8,7 @@ from vllm.config import ModelConfig from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT -from vllm.vllm.multimodal.base import MultiModalDataDict +from vllm.multimodal.base import MultiModalDataDict class ImageFetchAiohttp: From d9a2908528b8174fa4c8e9b230ac292ac6d973ed Mon Sep 17 00:00:00 2001 From: Roger Wang Date: Mon, 1 Jul 2024 22:23:27 -0700 Subject: [PATCH 21/21] update llava next example --- examples/llava_next_example.py | 61 +++++++++++++++++++--------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/examples/llava_next_example.py b/examples/llava_next_example.py index e90a86abe41c..3c39590e7fb8 100644 --- a/examples/llava_next_example.py +++ b/examples/llava_next_example.py @@ -4,35 +4,44 @@ from PIL import Image from vllm import LLM, SamplingParams -from vllm.multimodal.image import ImagePixelData # Dynamic image input is currently not supported and therefore # a fixed image input shape and its corresponding feature size is required. # See https://github.com/vllm-project/vllm/pull/4199 for the complete # configuration matrix. -llm = LLM( - model="llava-hf/llava-v1.6-mistral-7b-hf", - image_input_type="pixel_values", - image_token_id=32000, - image_input_shape="1,3,336,336", - image_feature_size=1176, -) - -prompt = "[INST] " + "" * 1176 + "\nWhat is shown in this image? [/INST]" -url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" -image = Image.open(BytesIO(requests.get(url).content)) -sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100) - -outputs = llm.generate( - { - "prompt": prompt, - "multi_modal_data": ImagePixelData(image), - }, - sampling_params=sampling_params) - -generated_text = "" -for o in outputs: - generated_text += o.outputs[0].text - -print(f"LLM output:{generated_text}") + +def run_llava_next(): + llm = LLM( + model="llava-hf/llava-v1.6-mistral-7b-hf", + image_token_id=32000, + image_input_shape="1,3,336,336", + image_feature_size=1176, + ) + + prompt = "[INST] " + "" * 1176 + ( + "\nWhat is shown in this image? [/INST]") + url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg" + image = Image.open(BytesIO(requests.get(url).content)) + sampling_params = SamplingParams(temperature=0.8, + top_p=0.95, + max_tokens=100) + + outputs = llm.generate( + { + "prompt": prompt, + "multi_modal_data": { + "image": image + } + }, + sampling_params=sampling_params) + + generated_text = "" + for o in outputs: + generated_text += o.outputs[0].text + + print(f"LLM output:{generated_text}") + + +if __name__ == "__main__": + run_llava_next()
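
Taken together, the series replaces the typed MultiModalData hierarchy with a string-keyed plugin registry: each plugin names its multi_modal_data key via get_data_key() and supplies a _default_input_mapper, and MultiModalRegistry.map_input dispatches each dict entry to the plugin registered under that key. The sketch below is only an illustration of how a third-party modality could hook into that interface; AudioPlugin, the "audio" key, and the audio_features keyword argument are hypothetical, while MultiModalPlugin, InputContext, and MULTIMODAL_REGISTRY.register_plugin are the interfaces introduced in the diffs above.

    # Illustrative sketch only: a hypothetical "audio" plugin for the
    # string-keyed registry introduced by this series. AudioPlugin, the
    # "audio" key, and "audio_features" are made up for illustration.
    from typing import Dict

    import torch

    from vllm.inputs import InputContext
    from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalPlugin


    class AudioPlugin(MultiModalPlugin):
        """Handles values passed as multi_modal_data={"audio": ...}."""

        def get_data_key(self) -> str:
            # The multi_modal_data key this plugin is responsible for.
            return "audio"

        def _default_input_mapper(self, ctx: InputContext,
                                  data: object) -> Dict[str, torch.Tensor]:
            # Turn the raw modality value into keyword arguments for the
            # model's forward(), mirroring the image plugin's type check.
            if isinstance(data, torch.Tensor):
                return {"audio_features": data.to(ctx.model_config.dtype)}
            raise TypeError(f"Invalid type for 'audio': {type(data)}")


    # After registration, MultiModalRegistry.map_input will route any
    # "audio" entry of a MultiModalDataDict to this plugin.
    MULTIMODAL_REGISTRY.register_plugin(AudioPlugin())

A model class would still opt in per modality, for example by decorating it with MULTIMODAL_REGISTRY.register_input_mapper("audio") (optionally passing a custom mapper), just as the LLaVA, LLaVA-NeXT, and Phi-3-V models do for "image" in the patches above.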