From e807125936a9db796746b67ba72c222b5c26582e Mon Sep 17 00:00:00 2001
From: Isotr0py <2037008807@qq.com>
Date: Sat, 7 Sep 2024 16:38:23 +0800
Subject: [PATCH] [Model][VLM] Support multi-images inputs for InternVL2 models (#8201)

---
 docs/source/models/supported_models.rst       |  2 +-
 ...e_inference_vision_language_multi_image.py | 94 +++++++++++++++----
 tests/models/test_internvl.py                 | 92 ++++++++++++++----
 tests/models/test_phi3v.py                    |  8 +-
 vllm/model_executor/models/internvl.py        | 60 +++++++++---
 5 files changed, 199 insertions(+), 57 deletions(-)

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index fe01e1681353..1bb3a448f2c9 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -214,7 +214,7 @@ Multimodal Language Models
     -
   * - :code:`InternVLChatModel`
     - InternVL2
-    - Image\ :sup:`E`
+    - Image\ :sup:`E+`
     - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
     -
   * - :code:`LlavaForConditionalGeneration`
diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py
index 73543ab5da2b..dd84627b9dc5 100644
--- a/examples/offline_inference_vision_language_multi_image.py
+++ b/examples/offline_inference_vision_language_multi_image.py
@@ -6,7 +6,9 @@
 from argparse import Namespace
 from typing import List
 
-from vllm import LLM
+from transformers import AutoTokenizer
+
+from vllm import LLM, SamplingParams
 from vllm.multimodal.utils import fetch_image
 from vllm.utils import FlexibleArgumentParser
 
@@ -17,36 +19,84 @@
 ]
 
 
-def _load_phi3v(image_urls: List[str]):
-    return LLM(
+def load_phi3v(question, image_urls: List[str]):
+    llm = LLM(
         model="microsoft/Phi-3.5-vision-instruct",
         trust_remote_code=True,
         max_model_len=4096,
         limit_mm_per_prompt={"image": len(image_urls)},
     )
-
-
-def run_phi3v_generate(question: str, image_urls: List[str]):
-    llm = _load_phi3v(image_urls)
-
     placeholders = "\n".join(f"<|image_{i}|>"
                              for i, _ in enumerate(image_urls, start=1))
     prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n"
+    stop_token_ids = None
+    return llm, prompt, stop_token_ids
 
-    outputs = llm.generate({
-        "prompt": prompt,
-        "multi_modal_data": {
-            "image": [fetch_image(url) for url in image_urls]
+
+def load_internvl(question, image_urls: List[str]):
+    model_name = "OpenGVLab/InternVL2-2B"
+
+    llm = LLM(
+        model=model_name,
+        trust_remote_code=True,
+        max_num_seqs=5,
+        max_model_len=4096,
+        limit_mm_per_prompt={"image": len(image_urls)},
+    )
+
+    placeholders = "\n".join(f"Image-{i}: <image>\n"
+                             for i, _ in enumerate(image_urls, start=1))
+    messages = [{'role': 'user', 'content': f"{placeholders}\n{question}"}]
+
+    tokenizer = AutoTokenizer.from_pretrained(model_name,
+                                              trust_remote_code=True)
+    prompt = tokenizer.apply_chat_template(messages,
+                                           tokenize=False,
+                                           add_generation_prompt=True)
+
+    # Stop tokens for InternVL
+    # model variants may have different stop tokens
+    # please refer to the model card for the correct "stop words":
+    # https://huggingface.co/OpenGVLab/InternVL2-2B#service
+    stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
+    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
+    return llm, prompt, stop_token_ids
+
+
+model_example_map = {
+    "phi3_v": load_phi3v,
+    "internvl_chat": load_internvl,
+}
+
+
+def run_generate(model, question: str, image_urls: List[str]):
+    llm, prompt, stop_token_ids = model_example_map[model](question,
+                                                           image_urls)
+
+    sampling_params = SamplingParams(temperature=0.0,
+                                     max_tokens=128,
+                                     stop_token_ids=stop_token_ids)
+
+    outputs = llm.generate(
+        {
+            "prompt": prompt,
+            "multi_modal_data": {
+                "image": [fetch_image(url) for url in image_urls]
+            },
         },
-    })
+        sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
 
 
-def run_phi3v_chat(question: str, image_urls: List[str]):
-    llm = _load_phi3v(image_urls)
+def run_chat(model: str, question: str, image_urls: List[str]):
+    llm, _, stop_token_ids = model_example_map[model](question, image_urls)
+
+    sampling_params = SamplingParams(temperature=0.0,
+                                     max_tokens=128,
+                                     stop_token_ids=stop_token_ids)
 
     outputs = llm.chat([{
         "role":
@@ -63,7 +113,8 @@ def run_phi3v_chat(question: str, image_urls: List[str]):
                 },
             } for image_url in image_urls),
         ],
-    }])
+    }],
+                       sampling_params=sampling_params)
 
     for o in outputs:
         generated_text = o.outputs[0].text
@@ -71,12 +122,13 @@ def run_phi3v_chat(question: str, image_urls: List[str]):
 
 
 def main(args: Namespace):
+    model = args.model_type
     method = args.method
 
     if method == "generate":
-        run_phi3v_generate(QUESTION, IMAGE_URLS)
+        run_generate(model, QUESTION, IMAGE_URLS)
     elif method == "chat":
-        run_phi3v_chat(QUESTION, IMAGE_URLS)
+        run_chat(model, QUESTION, IMAGE_URLS)
     else:
         raise ValueError(f"Invalid method: {method}")
 
@@ -85,6 +137,12 @@ def main(args: Namespace):
     parser = FlexibleArgumentParser(
         description='Demo on using vLLM for offline inference with '
         'vision language models that support multi-image input')
+    parser.add_argument('--model-type',
+                        '-m',
+                        type=str,
+                        default="phi3_v",
+                        choices=model_example_map.keys(),
+                        help='Huggingface "model_type".')
     parser.add_argument("--method",
                         type=str,
                         default="generate",
diff --git a/tests/models/test_internvl.py b/tests/models/test_internvl.py
index 42732cebc656..fa3369dc5334 100644
--- a/tests/models/test_internvl.py
+++ b/tests/models/test_internvl.py
@@ -1,5 +1,5 @@
 import types
-from typing import List, Optional, Tuple, Type
+from typing import List, Optional, Tuple, Type, Union
 
 import pytest
 import torch
@@ -9,7 +9,8 @@
 from vllm.multimodal.utils import rescale_image_size
 from vllm.utils import is_cpu
 
-from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
+from ..conftest import (IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner,
+                        _ImageAssets)
 from .utils import check_logprobs_close
 
 pytestmark = pytest.mark.vlm
@@ -20,6 +21,7 @@
     "cherry_blossom":
     "<|im_start|>User\n<image>\nWhat is the season?<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
 })
+HF_MULTIIMAGE_IMAGE_PROMPT = "<|im_start|>User\nImage-1: <image>\nImage-2: <image>\nDescribe the two images in detail.<|im_end|>\n<|im_start|>Assistant\n"  # noqa: E501
 
 models = [
     "OpenGVLab/InternVL2-1B",
@@ -64,13 +66,13 @@ def generate(
 def run_test(
     hf_runner: Type[HfRunner],
     vllm_runner: Type[VllmRunner],
-    image_assets: _ImageAssets,
+    inputs: List[Tuple[List[str], PromptImageInput]],
     model: str,
     *,
-    size_factors: List[float],
     dtype: str,
     max_tokens: int,
     num_logprobs: int,
+    mm_limit: int,
     tensor_parallel_size: int,
     distributed_executor_backend: Optional[str] = None,
 ):
@@ -83,12 +85,6 @@ def run_test(
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
""" - images = [asset.pil_image for asset in image_assets] - - inputs_per_image = [( - [prompt for _ in size_factors], - [rescale_image_size(image, factor) for factor in size_factors], - ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] # NOTE: take care of the order. run vLLM first, and then run HF. # vLLM needs a fresh new process without cuda initialization. @@ -110,13 +106,21 @@ def __init__(self, hf_runner: HfRunner): self.max_num = self.config.max_dynamic_patch self.image_size = self.vision_config.image_size - def __call__(self, text: str, images: Image, **kwargs): + def __call__(self, text: str, images: Union[Image, List[Image]], + **kwargs): from vllm.model_executor.models.internvl import ( IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values) - pixel_values = image_to_pixel_values( - images, self.image_size, self.min_num, self.max_num, - self.use_thumbnail).to(self.dtype) - num_patches_list = [pixel_values.shape[0]] + images = [images] if isinstance(images, Image) else images + pixel_values = [ + image_to_pixel_values(image, self.image_size, self.min_num, + self.max_num, + self.use_thumbnail).to(self.dtype) + for image in images + ] + num_patches_list = [ + pixel_value.shape[0] for pixel_value in pixel_values + ] + pixel_values = torch.cat(pixel_values, dim=0) for num_patches in num_patches_list: context_tokens = IMG_CONTEXT * self.num_image_token \ * num_patches @@ -130,6 +134,7 @@ def __call__(self, text: str, images: Image, **kwargs): with vllm_runner(model, max_model_len=4096, dtype=dtype, + limit_mm_per_prompt={"image": mm_limit}, tensor_parallel_size=tensor_parallel_size, distributed_executor_backend=distributed_executor_backend, enforce_eager=True) as vllm_model: @@ -138,7 +143,7 @@ def __call__(self, text: str, images: Image, **kwargs): max_tokens, num_logprobs=num_logprobs, images=images) - for prompts, images in inputs_per_image + for prompts, images in inputs ] with hf_runner(model, dtype=dtype) as hf_model: @@ -156,7 +161,7 @@ def __call__(self, text: str, images: Image, **kwargs): num_logprobs=num_logprobs, images=hf_images, eos_token_id=eos_token_id) - for prompts, hf_images in inputs_per_image + for prompts, hf_images in inputs ] for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, @@ -264,15 +269,64 @@ def run_awq_test( @torch.inference_mode() def test_models(hf_runner, vllm_runner, image_assets, model, size_factors, dtype: str, max_tokens: int, num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + run_test( hf_runner, vllm_runner, - image_assets, + inputs_per_image, + model, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + mm_limit=1, + tensor_parallel_size=1, + ) + + +@pytest.mark.parametrize("model", models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.5, 0.75, 1.0], + ], +) +@pytest.mark.parametrize("dtype", [target_dtype]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +@torch.inference_mode() +def test_multi_images_models(hf_runner, vllm_runner, image_assets, model, + size_factors, dtype: str, max_tokens: int, + num_logprobs: int) -> None: + images = [asset.pil_image for asset in image_assets] + + inputs_per_case = [ + ([HF_MULTIIMAGE_IMAGE_PROMPT for _ in 
size_factors], + [[rescale_image_size(image, factor) for image in images] + for factor in size_factors]) + ] + + run_test( + hf_runner, + vllm_runner, + inputs_per_case, model, - size_factors=size_factors, dtype=dtype, max_tokens=max_tokens, num_logprobs=num_logprobs, + mm_limit=2, tensor_parallel_size=1, ) diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index e416a85b8962..6ecbf07a08b7 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -1,16 +1,15 @@ import os import re -from typing import List, Optional, Tuple, Type, Union +from typing import List, Optional, Tuple, Type import pytest -from PIL import Image from transformers import AutoTokenizer from vllm.multimodal.utils import rescale_image_size from vllm.sequence import SampleLogprobs from vllm.utils import is_cpu, is_hip -from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner +from ..conftest import IMAGE_ASSETS, HfRunner, PromptImageInput, VllmRunner from .utils import check_logprobs_close pytestmark = pytest.mark.vlm @@ -60,8 +59,7 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str, def run_test( hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], - inputs: List[Tuple[List[str], Union[List[Image.Image], - List[List[Image.Image]]]]], + inputs: List[Tuple[List[str], PromptImageInput]], model: str, *, dtype: str, diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 10fbb5663d27..0cf63d9e1fb2 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -5,6 +5,7 @@ # Licensed under The MIT License [see LICENSE for details] # -------------------------------------------------------- import itertools +import re from typing import (Iterable, List, Literal, Mapping, Optional, Tuple, TypedDict, Union) @@ -26,6 +27,7 @@ from vllm.multimodal.base import MultiModalInputs from vllm.multimodal.utils import cached_get_tokenizer from vllm.sequence import IntermediateTensors +from vllm.utils import is_list_of from .clip import (dummy_image_for_clip, dummy_seq_data_for_clip, get_clip_num_patches) @@ -95,8 +97,8 @@ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, - max_num: int, - image_size: int) -> Tuple[int, int, int]: + max_num: int, image_size: int, + use_thumbnail: bool) -> Tuple[int, int, int]: aspect_ratio = orig_width / orig_height # calculate the existing image aspect ratio @@ -114,17 +116,26 @@ def calculate_num_blocks(orig_width: int, orig_height: int, min_num: int, target_width = image_size * target_aspect_ratio[0] target_height = image_size * target_aspect_ratio[1] blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + # add thumbnail image if num_blocks > 1 + if use_thumbnail and blocks > 1: + blocks += 1 return blocks, target_width, target_height # adapted from https://huggingface.co/OpenGVLab/InternVL2-1B def dynamic_preprocess(image: Image.Image, min_num: int, max_num: int, image_size: int, - use_thumbnail: int) -> List[Image.Image]: + use_thumbnail: bool) -> List[Image.Image]: orig_width, orig_height = image.size + # calculate the number of blocks without thumbnail blocks, target_width, target_height = calculate_num_blocks( - orig_width, orig_height, min_num, max_num, image_size) + orig_width, + orig_height, + min_num, + max_num, + image_size, + use_thumbnail=False) # resize the image resized_img = image.resize((target_width, target_height)) processed_images = [] @@ -197,17 +208,23 @@ 
def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
                                            downsample_ratio)
 
     image_data = multi_modal_data["image"]
+    min_num = hf_config.min_dynamic_patch
+    max_num = hf_config.max_dynamic_patch
+    use_thumbnail = hf_config.use_thumbnail
     if isinstance(image_data, Image.Image):
         width, height = image_data.size
-        min_num = hf_config.min_dynamic_patch
-        max_num = hf_config.max_dynamic_patch
         num_blocks, _, _ = calculate_num_blocks(width, height, min_num,
-                                                max_num, image_size)
-        # add thumbnail image if num_blocks > 1
-        if hf_config.use_thumbnail and num_blocks > 1:
-            num_blocks += 1
-        image_feature_size = num_blocks * num_patches
-
+                                                max_num, image_size,
+                                                use_thumbnail)
+        image_feature_size = [num_blocks * num_patches]
+    elif is_list_of(image_data, Image.Image):
+        image_feature_size = []
+        for image in image_data:
+            width, height = image.size
+            num_blocks, _, _ = calculate_num_blocks(width, height, min_num,
+                                                    max_num, image_size,
+                                                    use_thumbnail)
+            image_feature_size.append(num_blocks * num_patches)
     elif isinstance(image_data, torch.Tensor):
         num_images, image_feature_size, hidden_size = image_data.shape
     else:
@@ -220,8 +237,14 @@ def input_processor_for_internvl(ctx: InputContext, llm_inputs: LLMInputs):
     prompt_token_ids = llm_inputs["prompt_token_ids"]
     if prompt is None:
         prompt = tokenizer.decode(prompt_token_ids)
-    image_prompt = IMG_START + IMG_CONTEXT * image_feature_size + IMG_END
-    new_prompt = prompt.replace('<image>', image_prompt, 1)
+
+    new_prompt = prompt
+    image_idx = sorted(map(int, re.findall(r"Image-(\d+): <image>\n", prompt)))
+    for idx, feature_size in enumerate(image_feature_size, start=1):
+        image_prompt = IMG_START + IMG_CONTEXT * feature_size + IMG_END
+        if not image_idx:
+            image_prompt = f"Image-{idx}: {image_prompt}"
+        new_prompt = new_prompt.replace('<image>', image_prompt, 1)
     new_prompt_token_ids = tokenizer.encode(new_prompt)
 
     return LLMInputs(prompt=prompt,
@@ -245,6 +268,15 @@ def input_mapper_for_internvl(ctx: InputContext, data: object):
                                      use_thumbnail=use_thumbnail)
         # Add an N dimension for number of images per prompt (currently 1).
         data = data.unsqueeze(0)
+    elif is_list_of(data, Image.Image):
+        data = [
+            image_to_pixel_values(img,
+                                  image_size,
+                                  min_num,
+                                  max_num,
+                                  use_thumbnail=use_thumbnail) for img in data
+        ]
+        data = torch.stack(data)
     model_config = ctx.model_config
     tokenizer = cached_get_tokenizer(model_config.tokenizer,
                                      trust_remote_code=True)
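
A minimal usage sketch of the multi-image path this patch enables, condensed from the example script changed above. The question text and image URLs below are placeholders to fill in (the script's own QUESTION and IMAGE_URLS constants are not shown in this diff); the stop tokens follow the InternVL2-2B model card referenced in the example.

# Sketch: multi-image offline inference with InternVL2 on vLLM, mirroring
# examples/offline_inference_vision_language_multi_image.py from this patch.
from transformers import AutoTokenizer

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import fetch_image

model_name = "OpenGVLab/InternVL2-2B"
question = "Describe the two images in detail."  # placeholder question
image_urls = [
    # Placeholders: substitute any two reachable image URLs.
    "https://example.com/first.jpg",
    "https://example.com/second.jpg",
]

llm = LLM(
    model=model_name,
    trust_remote_code=True,
    max_num_seqs=5,
    max_model_len=4096,
    # Raise the per-prompt image limit to the number of images being passed.
    limit_mm_per_prompt={"image": len(image_urls)},
)

# One numbered <image> placeholder per image, in the form the InternVL
# input processor added by this patch looks for.
placeholders = "\n".join(f"Image-{i}: <image>\n"
                         for i, _ in enumerate(image_urls, start=1))
messages = [{"role": "user", "content": f"{placeholders}\n{question}"}]

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
prompt = tokenizer.apply_chat_template(messages,
                                       tokenize=False,
                                       add_generation_prompt=True)

# Stop tokens per the InternVL2-2B model card.
stop_tokens = ["<|endoftext|>", "<|im_start|>", "<|im_end|>", "<|end|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(t) for t in stop_tokens]
sampling_params = SamplingParams(temperature=0.0,
                                 max_tokens=128,
                                 stop_token_ids=stop_token_ids)

outputs = llm.generate(
    {
        "prompt": prompt,
        "multi_modal_data": {
            "image": [fetch_image(url) for url in image_urls]
        },
    },
    sampling_params=sampling_params)

for o in outputs:
    print(o.outputs[0].text)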