
Commit 7f6091e

xwjiang2010, DarkLight1337, and ywang96 authored and committed
[VLM] Remove image_input_type from VLM config (vllm-project#5852)
Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
1 parent 72cd745 commit 7f6091e

35 files changed: +325 -747 lines changed
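The user-facing effect of this change: callers no longer pass image_input_type or construct ImagePixelData / ImageFeatureData wrappers; multi-modal input is now a plain dictionary keyed by modality. Below is a minimal before/after sketch assembled from the example diffs in this commit; the model, prompt, and image path come from the updated examples/llava_example.py and are illustrative only.

from PIL import Image

from vllm import LLM

llm = LLM(
    model="llava-hf/llava-1.5-7b-hf",
    # image_input_type is no longer a constructor argument after this commit
    image_token_id=32000,
    image_input_shape="1,3,336,336",
    image_feature_size=576,
)

prompt = "<image>" * 576 + "\nUSER: What is the content of this image?\nASSISTANT:"
image = Image.open("images/stop_sign.jpg")

# Before this commit: "multi_modal_data": ImagePixelData(image)
# After this commit:  a modality-keyed dict following MultiModalDataDict
outputs = llm.generate({
    "prompt": prompt,
    "multi_modal_data": {"image": image},
})

for o in outputs:
    print(o.outputs[0].text)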

.buildkite/download-images.sh

-4

@@ -8,10 +8,6 @@ set -o pipefail
 # aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
 mkdir -p images
 cd images
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
-wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
 wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg

docs/requirements-docs.txt

+4 -12

@@ -1,13 +1,5 @@
-sphinx == 6.2.1
-sphinx-book-theme == 1.0.1
-sphinx-copybutton == 0.5.2
-myst-parser == 2.0.0
+sphinx==6.2.1
+sphinx-book-theme==1.0.1
+sphinx-copybutton==0.5.2
+myst-parser==2.0.0
 sphinx-argparse
-
-# packages to install to build the documentation
-pydantic
--f https://download.pytorch.org/whl/cpu
-torch
-py-cpuinfo
-transformers
-openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args

docs/source/dev/multimodal/multimodal_index.rst

+5 -3

@@ -9,8 +9,10 @@ vLLM provides experimental support for multi-modal models through the :mod:`vllm
 which allows you to pass in multi-modal input alongside text and token prompts.
 
 By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
-you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
-as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type to support.
+you must decorate the model class with :meth:`InputRegistry.register_dummy_data <vllm.inputs.registry.InputRegistry.register_dummy_data>`,
+as well as :meth:`MULTIMODAL_REGISTRY.register_input_mapper <MultiModalRegistry.register_input_mapper>` for each modality type to support.
+
+# TODO: Add more instructions on how to do that once embeddings is in.
 
 Module Contents
 +++++++++++++++

@@ -29,7 +31,7 @@ Registry
 Base Classes
 ------------
 
-.. autoclass:: vllm.multimodal.MultiModalData
+.. autoclass:: vllm.multimodal.MultiModalDataDict
     :members:
     :show-inheritance:
 
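For model authors, the registration described in the updated paragraph above would look roughly like the sketch below. This is a rough sketch only, not the exact API at this commit: it assumes a module-level INPUT_REGISTRY instance of InputRegistry exported from vllm.inputs, relies on both registry methods acting as class decorators (as the docs state), and the dummy-data factory name, its signature, and the argument-less call to register_input_mapper are hypothetical placeholders.

import torch.nn as nn

from vllm.inputs import INPUT_REGISTRY  # assumed export of the global InputRegistry
from vllm.multimodal import MULTIMODAL_REGISTRY


def dummy_data_for_my_model(ctx, seq_len):
    # Hypothetical factory used for memory profiling; the real signature and
    # return type may differ at this commit.
    ...


@MULTIMODAL_REGISTRY.register_input_mapper()  # one registration per supported modality; argument form assumed
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_my_model)
class MyVisionLanguageModel(nn.Module):
    ...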

docs/source/models/vlm.rst

+7 -4

@@ -36,7 +36,6 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
         image_feature_size=576,

@@ -49,7 +48,12 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
 To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
-* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
+* ``multi_modal_data``: This is a dictionary that follows the schema defined in :class:`vllm.multimodal.MultiModalDataDict`.
+
+.. note::
+
+    ``multi_modal_data`` can accept keys and values beyond the builtin ones, as long as a customized plugin is registered through
+    :class:`vllm.multimodal.MULTIMODAL_REGISTRY`.
 
 .. code-block:: python
 

@@ -61,7 +65,7 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptS
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {"image": image},
     })
 
     for o in outputs:

@@ -93,7 +97,6 @@ Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with
 
     python -m vllm.entrypoints.openai.api_server \
         --model llava-hf/llava-1.5-7b-hf \
-        --image-input-type pixel_values \
         --image-token-id 32000 \
         --image-input-shape 1,3,336,336 \
         --image-feature-size 576 \

examples/llava_example.py

+8 -48

@@ -1,84 +1,44 @@
-import argparse
 import os
 import subprocess
 
-import torch
 from PIL import Image
 
 from vllm import LLM
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 
 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
 # You can use `.buildkite/download-images.sh` to download them
 
 
-def run_llava_pixel_values(*, disable_image_processor: bool = False):
+def run_llava():
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="pixel_values",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
         image_feature_size=576,
-        disable_image_processor=disable_image_processor,
     )
 
     prompt = "<image>" * 576 + (
         "\nUSER: What is the content of this image?\nASSISTANT:")
 
-    if disable_image_processor:
-        image = torch.load("images/stop_sign_pixel_values.pt")
-    else:
-        image = Image.open("images/stop_sign.jpg")
+    image = Image.open("images/stop_sign.jpg")
 
     outputs = llm.generate({
         "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
+        "multi_modal_data": {
+            "image": image
+        },
     })
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
 
 
-def run_llava_image_features():
-    llm = LLM(
-        model="llava-hf/llava-1.5-7b-hf",
-        image_input_type="image_features",
-        image_token_id=32000,
-        image_input_shape="1,576,1024",
-        image_feature_size=576,
-    )
-
-    prompt = "<image>" * 576 + (
-        "\nUSER: What is the content of this image?\nASSISTANT:")
-
-    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
-
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": ImageFeatureData(image),
-    })
-
-    for o in outputs:
-        generated_text = o.outputs[0].text
-        print(generated_text)
-
-
-def main(args):
-    if args.type == "pixel_values":
-        run_llava_pixel_values()
-    else:
-        run_llava_image_features()
+def main():
+    run_llava()
 
 
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Demo on Llava")
-    parser.add_argument("--type",
-                        type=str,
-                        choices=["pixel_values", "image_features"],
-                        default="pixel_values",
-                        help="image input type")
-    args = parser.parse_args()
     # Download from s3
     s3_bucket_path = "s3://air-example-data-2/vllm_opensource_llava/"
     local_directory = "images"

@@ -95,4 +55,4 @@ def main(args):
         local_directory,
         "--no-sign-request",
     ])
-    main(args)
+    main()

examples/llava_next_example.py

+35 -26

@@ -4,35 +4,44 @@
 from PIL import Image
 
 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData
 
 # Dynamic image input is currently not supported and therefore
 # a fixed image input shape and its corresponding feature size is required.
 # See https://github.com/vllm-project/vllm/pull/4199 for the complete
 # configuration matrix.
 
-llm = LLM(
-    model="llava-hf/llava-v1.6-mistral-7b-hf",
-    image_input_type="pixel_values",
-    image_token_id=32000,
-    image_input_shape="1,3,336,336",
-    image_feature_size=1176,
-)
-
-prompt = "[INST] " + "<image>" * 1176 + "\nWhat is shown in this image? [/INST]"
-url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
-image = Image.open(BytesIO(requests.get(url).content))
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=100)
-
-outputs = llm.generate(
-    {
-        "prompt": prompt,
-        "multi_modal_data": ImagePixelData(image),
-    },
-    sampling_params=sampling_params)
-
-generated_text = ""
-for o in outputs:
-    generated_text += o.outputs[0].text
-
-print(f"LLM output:{generated_text}")
+
+def run_llava_next():
+    llm = LLM(
+        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        image_token_id=32000,
+        image_input_shape="1,3,336,336",
+        image_feature_size=1176,
+    )
+
+    prompt = "[INST] " + "<image>" * 1176 + (
+        "\nWhat is shown in this image? [/INST]")
+    url = "https://h2o-release.s3.amazonaws.com/h2ogpt/bigben.jpg"
+    image = Image.open(BytesIO(requests.get(url).content))
+    sampling_params = SamplingParams(temperature=0.8,
+                                     top_p=0.95,
+                                     max_tokens=100)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": image
+            }
+        },
+        sampling_params=sampling_params)
+
+    generated_text = ""
+    for o in outputs:
+        generated_text += o.outputs[0].text
+
+    print(f"LLM output:{generated_text}")
+
+
+if __name__ == "__main__":
+    run_llava_next()

examples/openai_vision_api_client.py

-1

@@ -3,7 +3,6 @@
 Launch the vLLM server with the following command:
 python -m vllm.entrypoints.openai.api_server \
     --model llava-hf/llava-1.5-7b-hf \
-    --image-input-type pixel_values \
     --image-token-id 32000 \
     --image-input-shape 1,3,336,336 \
     --image-feature-size 576 \
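Only the server launch flags change in this file; the client side of the example is untouched. For context, a minimal client sketch (not part of this diff): it assumes the server launched with the flags above is listening on http://localhost:8000/v1, accepts a placeholder API key, and can fetch the stop-sign image URL used elsewhere in this commit.

from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

chat_response = client.chat.completions.create(
    model="llava-hf/llava-1.5-7b-hf",
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "What is in this image?"},
            {
                "type": "image_url",
                "image_url": {
                    "url": "https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg"
                },
            },
        ],
    }],
    max_tokens=64,
)
print(chat_response.choices[0].message.content)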

examples/phi3v_example.py

+3 -3

@@ -4,7 +4,6 @@
 from PIL import Image
 
 from vllm import LLM, SamplingParams
-from vllm.multimodal.image import ImagePixelData
 
 
 def run_phi3v():

@@ -17,7 +16,6 @@ def run_phi3v():
     llm = LLM(
         model=model_path,
         trust_remote_code=True,
-        image_input_type="pixel_values",
         image_token_id=32044,
         image_input_shape="1,3,1008,1344",
         image_feature_size=1921,

@@ -35,7 +33,9 @@ def run_phi3v():
     outputs = llm.generate(
         {
             "prompt": prompt,
-            "multi_modal_data": ImagePixelData(image),
+            "multi_modal_data": {
+                "image": image
+            },
         },
         sampling_params=sampling_params)
     for o in outputs:

tests/conftest.py

+8 -30

@@ -17,19 +17,17 @@
                           AutoTokenizer, BatchEncoding)
 
 from vllm import LLM, SamplingParams
-from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
+from vllm.config import TokenizerPoolConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
+from vllm.sequence import SampleLogprobs
+from vllm.utils import cuda_device_count_stateless, is_cpu
 
 if TYPE_CHECKING:
-    from vllm.multimodal import MultiModalData
-else:
     # it will call torch.cuda.device_count()
-    MultiModalData = None
-from vllm.sequence import SampleLogprobs
-from vllm.utils import cuda_device_count_stateless, is_cpu
+    from vllm.multimodal import MultiModalDataDict
 
 logger = init_logger(__name__)
 

@@ -51,35 +49,15 @@ def _read_prompts(filename: str) -> List[str]:
 class ImageAsset:
     name: Literal["stop_sign", "cherry_blossom"]
 
-    @cached_property
-    def pixel_values(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_pixel_values.pt")
-
-    @cached_property
-    def image_features(self) -> torch.Tensor:
-        return torch.load(_IMAGE_DIR / f"{self.name}_image_features.pt")
-
     @cached_property
     def pil_image(self) -> Image.Image:
         return Image.open(_IMAGE_DIR / f"{self.name}.jpg")
 
     def for_hf(self) -> Image.Image:
         return self.pil_image
 
-    def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
-        # don't put this import at the top level
-        # it will call torch.cuda.device_count()
-        from vllm.multimodal.image import ImageFeatureData  # noqa: F401
-        from vllm.multimodal.image import ImagePixelData
-        image_input_type = vision_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if image_input_type == ImageInputType.IMAGE_FEATURES:
-            return ImageFeatureData(self.image_features)
-        if image_input_type == ImageInputType.PIXEL_VALUES:
-            return ImagePixelData(self.pil_image)
-
-        raise NotImplementedError
+    def for_vllm(self) -> Dict[str, Any]:
+        return {"image": self.pil_image}
 
 
 class _ImageAssetPrompts(TypedDict):

@@ -637,7 +615,7 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)

@@ -686,7 +664,7 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[List[MultiModalData]] = None,
+        images: Optional[List["MultiModalDataDict"]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
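The reworked ImageAsset helper now hands back the multi-modal dict directly instead of an input-type-dependent wrapper. A trimmed, self-contained stand-in that mirrors the diffed helper (the dataclass machinery, _IMAGE_DIR, and cached_property details of the real conftest.py are omitted; only the dict schema comes from the diff):

from typing import Any, Dict

from PIL import Image


class ImageAssetStandIn:
    """Simplified stand-in for the ImageAsset helper in tests/conftest.py."""

    def __init__(self, name: str) -> None:
        self.name = name

    @property
    def pil_image(self) -> Image.Image:
        return Image.open(f"images/{self.name}.jpg")

    def for_vllm(self) -> Dict[str, Any]:
        # Previously returned ImagePixelData or ImageFeatureData depending on
        # vision_config.image_input_type; now it is just a modality-keyed dict.
        return {"image": self.pil_image}


mm_data = ImageAssetStandIn("stop_sign").for_vllm()
# mm_data == {"image": <PIL.Image.Image>}, ready to be passed as
# "multi_modal_data" to llm.generate(...) or via the images= test parameter.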

tests/entrypoints/openai/test_vision.py

-2

@@ -44,8 +44,6 @@ def server():
     "--max-model-len",
     "4096",
     "--enforce-eager",
-    "--image-input-type",
-    "pixel_values",
     "--image-token-id",
     "32000",
     "--image-input-shape",
