[Model] Initialize support for InternVL2 series models #6514

Merged: 42 commits, Jul 29, 2024
Commits
4102e8c
init internvl support
Isotr0py Jul 10, 2024
2feadc8
add internvl2 support for 1B and 2B
Isotr0py Jul 13, 2024
958a926
add internvl2 example
Isotr0py Jul 17, 2024
7142c60
add internvl2 test
Isotr0py Jul 17, 2024
26c2651
fix internvl2 test
Isotr0py Jul 17, 2024
6e8135c
Merge branch 'vllm-project:main' into internvl
Isotr0py Jul 17, 2024
9d8cb20
format code
Isotr0py Jul 17, 2024
45b0cd7
add docs
Isotr0py Jul 17, 2024
1dafa5e
remove unused code in test
Isotr0py Jul 17, 2024
1fbe958
fix dummy data for internvl
Isotr0py Jul 17, 2024
91572b9
format code
Isotr0py Jul 17, 2024
56e171f
update internvl example
Isotr0py Jul 20, 2024
5971c11
fix internvl-2B test
Isotr0py Jul 21, 2024
cf1784c
Merge branch 'vllm-project:main' into internvl
Isotr0py Jul 25, 2024
ea61600
fix internvl test
Isotr0py Jul 25, 2024
49bdf60
format internvl2 test
Isotr0py Jul 25, 2024
f2d6bdd
add timm to test requirements
Isotr0py Jul 25, 2024
8aa0ac7
fix internvl test
Isotr0py Jul 25, 2024
95d8b4f
Merge branch 'main' into internvl
Isotr0py Jul 26, 2024
9bee8a8
port and format internvl config
Isotr0py Jul 26, 2024
4d9946c
format code
Isotr0py Jul 26, 2024
4dae318
isort
Isotr0py Jul 26, 2024
eea984f
format stacked_params_mapping
Isotr0py Jul 26, 2024
1b7c795
remove broken 4B test
Isotr0py Jul 27, 2024
6fb37ec
Merge branch 'main' into internvl
Isotr0py Jul 27, 2024
9f3cd25
fix a typo
Isotr0py Jul 27, 2024
b861119
Merge branch 'vllm-project:main' into internvl
Isotr0py Jul 27, 2024
405162e
migrate internvl example
Isotr0py Jul 27, 2024
c2e8cba
use sdpa for internvl attention
Isotr0py Jul 27, 2024
de8573d
update chat_utils and model_type
Isotr0py Jul 27, 2024
aeb10a9
reorganize internvl code
Isotr0py Jul 27, 2024
443650e
Merge branch 'main' into internvl
Isotr0py Jul 27, 2024
a32fcbe
refactor intern_vit
Isotr0py Jul 27, 2024
61df94f
fix example template
Isotr0py Jul 27, 2024
daa2233
fix test internvl prompt format
Isotr0py Jul 27, 2024
72d8350
format code
Isotr0py Jul 27, 2024
eb914ff
Fix internvl input processor
Isotr0py Jul 27, 2024
c243d71
separate llm_class selection
Isotr0py Jul 28, 2024
10c11e3
isort
Isotr0py Jul 28, 2024
e17c0be
revert llm backbone selection
Isotr0py Jul 28, 2024
a846154
remove template and fix typo
Isotr0py Jul 29, 2024
26fe0b0
update
ywang96 Jul 29, 2024
Files changed (showing changes from 11 of the 42 commits)
4 changes: 4 additions & 0 deletions docs/source/models/supported_models.rst
@@ -186,6 +186,10 @@ Vision Language Models
- Fuyu
- :code:`adept/fuyu-8b` etc.
-
* - :code:`InternVLChatModel`
- InternVL2
- :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc.
-
* - :code:`LlavaForConditionalGeneration`
- LLaVA-1.5
- :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
39 changes: 39 additions & 0 deletions examples/internvl_example.py
@@ -0,0 +1,39 @@
from PIL import Image

from vllm import LLM, SamplingParams

# The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
# You can use `.buildkite/download-images.sh` to download them


def run_internvl():
model_path = "OpenGVLab/InternVL2-4B"

llm = LLM(
model=model_path,
max_model_len=4096,
trust_remote_code=True,
max_num_seqs=5,
)

image = Image.open("images/stop_sign.jpg")

# single-image prompt
prompt = "<image>\nWhat is the content of this image?\n" # noqa: E501
sampling_params = SamplingParams(temperature=0, max_tokens=128)

outputs = llm.generate(
{
"prompt": prompt,
"multi_modal_data": {
"image": image
},
},
sampling_params=sampling_params)
for o in outputs:
generated_text = o.outputs[0].text
print(generated_text)


if __name__ == "__main__":
run_internvl()
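
For reference, the same entry point also accepts a batch of requests. A batched variant of the example might look like the sketch below; this is illustrative only, the second prompt is made up, and it assumes both image assets have been downloaded into images/ via .buildkite/download-images.sh as noted in the file header.

from PIL import Image

from vllm import LLM, SamplingParams

# Illustrative sketch: batch several image prompts through one LLM instance.
llm = LLM(
    model="OpenGVLab/InternVL2-4B",
    max_model_len=4096,
    trust_remote_code=True,
    max_num_seqs=5,
)

requests = [
    {
        "prompt": "<image>\nWhat is the content of this image?\n",
        "multi_modal_data": {"image": Image.open("images/stop_sign.jpg")},
    },
    {
        "prompt": "<image>\nWhat season is shown in this image?\n",
        "multi_modal_data": {"image": Image.open("images/cherry_blossom.jpg")},
    },
]

sampling_params = SamplingParams(temperature=0, max_tokens=128)
for output in llm.generate(requests, sampling_params=sampling_params):
    print(output.outputs[0].text)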
157 changes: 157 additions & 0 deletions tests/models/test_internvl2.py
@@ -0,0 +1,157 @@
from typing import List, Optional, Type

import pytest
from PIL.Image import Image

from vllm.model_executor.models.internvl import (IMG_CONTEXT, IMG_END,
IMG_START,
image_to_pixel_values)
from vllm.multimodal.utils import rescale_image_size
from vllm.utils import is_cpu

from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
from .utils import check_logprobs_close

pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<image>\nWhat's the content of the image?\n",
"cherry_blossom":
"<image>\nWhat is the season?\n",
})

models = ["OpenGVLab/InternVL2-4B", "OpenGVLab/InternVL2-8B"]


class InternVLProcessor:
"""A simple processor for InternVL2 HF model which misses a processor."""

def __init__(self, hf_runner: HfRunner):
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.dtype = hf_runner.model.dtype

def __call__(self, text: str, images: Image, **kwargs):
pixel_values = image_to_pixel_values(images).to(self.dtype)
num_patches_list = [pixel_values.shape[0]]
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt


def run_test(
hf_runner: Type[HfRunner],
vllm_runner: Type[VllmRunner],
image_assets: _ImageAssets,
model: str,
*,
size_factors: List[float],
dtype: str,
max_tokens: int,
num_logprobs: int,
tensor_parallel_size: int,
distributed_executor_backend: Optional[str] = None,
):
"""Inference result should be the same between hf and vllm.

All the image fixtures for the test are under tests/images.
For the huggingface runner, we provide the PIL images as input.
For the vllm runner, we provide MultiModalDataDict objects
and the corresponding vision language config as input.
Note that the text input is also adjusted to abide by the vllm contract.
The text output is sanitized to be able to compare with hf.
"""
images = [asset.pil_image for asset in image_assets]

inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# if we run HF first, the cuda initialization will be done and it
# will hurt multiprocessing backend with fork method (the default method).

# max_model_len should be greater than image_feature_size
with vllm_runner(model,
max_model_len=2048,
dtype=dtype,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=True) as vllm_model:
vllm_outputs_per_image = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images)
for prompts, images in inputs_per_image
]

with hf_runner(model, dtype=dtype) as hf_model:
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = InternVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_outputs_per_image = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=hf_images)
for prompts, hf_images in inputs_per_image
]

for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
vllm_outputs_per_image):
# TODO: Check whether using original CLIPVisionModel can improve
# consistency against HF
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
name_0="hf",
name_1="vllm",
)


target_dtype = "half"
if is_cpu():
target_dtype = "bfloat16"


@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
"size_factors",
[
# No image
[],
# Single-scale
[1.0],
# Single-scale, batched
[1.0, 1.0, 1.0],
# Multi-scale
[0.25, 0.5, 1.0],
],
)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_tokens: int, num_logprobs: int) -> None:
run_test(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors=size_factors,
dtype=dtype,
max_tokens=max_tokens,
num_logprobs=num_logprobs,
tensor_parallel_size=1,
)
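
For intuition on what InternVLProcessor does above: each <image> placeholder in the prompt is replaced by IMG_START, then num_image_token * num_patches copies of IMG_CONTEXT, then IMG_END. The sketch below walks through that expansion with made-up numbers; the literal marker strings and the counts are assumptions for illustration, not values taken from this PR.

# Illustrative only: marker strings and counts are assumed, not from this PR.
IMG_START, IMG_END, IMG_CONTEXT = "<img>", "</img>", "<IMG_CONTEXT>"

num_image_token = 256  # tokens contributed per image tile (assumed)
num_patches = 7        # tiles produced by image_to_pixel_values (assumed)

context_tokens = IMG_CONTEXT * num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
prompt = "<image>\nWhat's the content of the image?\n".replace("<image>", image_tokens, 1)

# With these numbers, 256 * 7 = 1792 IMG_CONTEXT placeholders end up in the
# prompt, which is why the comment in run_test notes that max_model_len
# (2048 here) must be greater than the image feature size.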
1 change: 1 addition & 0 deletions vllm/model_executor/models/__init__.py
@@ -32,6 +32,7 @@
"GPTNeoXForCausalLM": ("gpt_neox", "GPTNeoXForCausalLM"),
"InternLMForCausalLM": ("llama", "LlamaForCausalLM"),
"InternLM2ForCausalLM": ("internlm2", "InternLM2ForCausalLM"),
"InternVLChatModel": ("internvl", "InternVLChatModel"),
"JAISLMHeadModel": ("jais", "JAISLMHeadModel"),
"LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
"LlavaForConditionalGeneration":
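
The key of each entry in this registry is the architecture name a checkpoint advertises in its config.json (here InternVLChatModel), and the value is the (module, class) pair under vllm/model_executor/models that vLLM imports to serve it.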
10 changes: 9 additions & 1 deletion vllm/model_executor/models/internlm2.py
@@ -219,14 +219,22 @@ def __init__(
])
self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor:
return self.tok_embeddings(input_ids)

def forward(
self,
input_ids: torch.Tensor,
positions: torch.Tensor,
kv_caches: List[torch.Tensor],
attn_metadata: AttentionMetadata,
intermediate_tensors: IntermediateTensors = None,
inputs_embeds: Optional[torch.Tensor] = None,
) -> torch.Tensor:
hidden_states = self.tok_embeddings(input_ids)
if inputs_embeds is not None:
hidden_states = inputs_embeds
else:
hidden_states = self.tok_embeddings(input_ids)
residual = None
for i in range(len(self.layers)):
layer = self.layers[i]
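
The inputs_embeds plumbing added to InternLM2Model above is what lets a multimodal wrapper compute the embeddings itself and hand them to the language backbone: vision features are spliced into the token embeddings, and the backbone then skips its own tok_embeddings lookup. A minimal sketch of that pattern follows; the helper name and the merge-by-token-id scheme are illustrative assumptions, not the exact code added elsewhere in this PR.

import torch


def merge_multimodal_embeddings(
    input_ids: torch.Tensor,      # (num_tokens,)
    text_embeds: torch.Tensor,    # (num_tokens, hidden_size)
    vision_embeds: torch.Tensor,  # (num_image_tokens, hidden_size)
    image_token_id: int,
) -> torch.Tensor:
    """Replace embeddings of image-placeholder tokens with vision features."""
    merged = text_embeds.clone()
    mask = input_ids == image_token_id
    merged[mask] = vision_embeds.to(merged.dtype)
    return merged


# Inside a hypothetical multimodal forward():
#   text_embeds = language_model.get_input_embeddings(input_ids)
#   inputs_embeds = merge_multimodal_embeddings(
#       input_ids, text_embeds, vision_embeds, img_context_token_id)
#   hidden_states = language_model(input_ids, positions, kv_caches,
#                                  attn_metadata, intermediate_tensors,
#                                  inputs_embeds=inputs_embeds)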