[Model] Initial support for BLIP-2 (#5920)
Co-authored-by: ywang96 <ywang@roblox.com>
DarkLight1337 and ywang96 authored Jul 27, 2024
1 parent ecb33a2 commit 1ad86ac
Showing 12 changed files with 1,107 additions and 21 deletions.
8 changes: 8 additions & 0 deletions docs/source/models/supported_models.rst
@@ -7,6 +7,8 @@ vLLM supports a variety of generative Transformer models in `HuggingFace Transfo
The following is the list of model architectures that are currently supported by vLLM.
Alongside each architecture, we include some popular models that use it.

----

Decoder-only Language Models
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. list-table::
@@ -186,6 +188,10 @@ Vision Language Models
- Models
- Example HuggingFace Models
- :ref:`LoRA <lora>`
* - :code:`Blip2ForConditionalGeneration`
- BLIP-2
- :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc.
-
* - :code:`ChameleonForConditionalGeneration`
- Chameleon
- :code:`facebook/chameleon-7b` etc.
@@ -215,6 +221,8 @@ Vision Language Models
- :code:`openbmb/MiniCPM-V-2`, :code:`openbmb/MiniCPM-Llama3-V-2_5`, etc.
-

----

If your model uses one of the above model architectures, you can seamlessly run your model with vLLM.
Otherwise, please refer to :ref:`Adding a New Model <adding_a_new_model>` and :ref:`Enabling Multimodal Inputs <enabling_multimodal_inputs>`
for instructions on how to implement support for your model.
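For reference, a minimal offline-inference sketch for one of the newly listed BLIP-2 checkpoints, assuming the multi-modal input format used by vLLM's example scripts at this point in the project's history (the image path and question are illustrative, not part of this change):

    from PIL import Image
    from vllm import LLM, SamplingParams

    # Load a BLIP-2 checkpoint; the prompt format follows examples/template_blip2.jinja.
    llm = LLM(model="Salesforce/blip2-opt-2.7b")
    prompt = "Question: What is shown in the image? Answer:"
    image = Image.open("image.jpg")  # illustrative local file

    outputs = llm.generate(
        {"prompt": prompt, "multi_modal_data": {"image": image}},
        SamplingParams(temperature=0.0, max_tokens=64),
    )
    print(outputs[0].outputs[0].text)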
11 changes: 11 additions & 0 deletions examples/offline_inference_vision_language.py
@@ -106,6 +106,16 @@ def run_minicpmv(question):
return llm, prompt


# BLIP-2
def run_blip2(question):

    # BLIP-2 prompt format is inaccurate on HuggingFace model repository.
    # See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262  # noqa
    prompt = f"Question: {question} Answer:"
    llm = LLM(model="Salesforce/blip2-opt-2.7b")
    return llm, prompt


model_example_map = {
"llava": run_llava,
"llava-next": run_llava_next,
@@ -114,6 +124,7 @@ def run_minicpmv(question):
"paligemma": run_paligemma,
"chameleon": run_chameleon,
"minicpmv": run_minicpmv,
"blip-2": run_blip2,
}


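As a quick check of the new mapping entry, the registered runner can be resolved directly, mirroring how the script's existing driver (not shown in this diff) consumes it:

    # Resolve the BLIP-2 runner through the mapping registered above.
    llm, prompt = model_example_map["blip-2"]("What is the content of the image?")
    # `llm` is ready for multi-modal generate() calls, and `prompt` already follows
    # the "Question: ... Answer:" format required by BLIP-2.

Assuming the script's existing --model-type argument, the same path can be exercised end to end with: python examples/offline_inference_vision_language.py --model-type blip-2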
11 changes: 11 additions & 0 deletions examples/template_blip2.jinja
@@ -0,0 +1,11 @@
{%- for message in messages -%}
{%- if message['role'] == 'user' -%}
{{- 'Question: ' + message['content'] + ' ' -}}
{%- elif message['role'] == 'assistant' -%}
{{- 'Answer: ' + message['content'] + ' ' -}}
{%- endif -%}
{%- endfor -%}

{%- if add_generation_prompt -%}
{{- 'Answer:' -}}
{% endif %}
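To illustrate what this template produces, here is a small standalone rendering check with the jinja2 package (the message content is made up for the example):

    from jinja2 import Template

    template = Template(open("examples/template_blip2.jinja").read())
    rendered = template.render(
        messages=[{"role": "user", "content": "What is in the image?"}],
        add_generation_prompt=True,
    )
    print(rendered)  # -> Question: What is in the image? Answer: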
102 changes: 102 additions & 0 deletions tests/models/test_blip2.py
@@ -0,0 +1,102 @@
from typing import List, Optional, Tuple

import pytest
from transformers import AutoTokenizer

from vllm.multimodal.utils import rescale_image_size
from vllm.sequence import SampleLogprobs

from ..conftest import IMAGE_ASSETS
from .utils import check_logprobs_close

pytestmark = pytest.mark.vlm

HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
    "stop_sign":
    "Question: What's the content of the image? Answer:",
    "cherry_blossom":
    "Question: What is the season? Answer:",
})


def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
                                         Optional[SampleLogprobs]],
                      model: str):
    """Sanitize vllm output to be comparable with hf output."""
    _, output_str, out_logprobs = vllm_output

    hf_output_str = output_str + "\n"

    tokenizer = AutoTokenizer.from_pretrained(model)
    hf_output_ids = tokenizer.encode(hf_output_str)
    assert hf_output_ids[0] == tokenizer.bos_token_id
    hf_output_ids = hf_output_ids[1:]

    return hf_output_ids, hf_output_str, out_logprobs


@pytest.mark.parametrize("model", ["Salesforce/blip2-opt-2.7b"])
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype: str, max_tokens: int, num_logprobs: int) -> None:
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test are under tests/images.
    For the huggingface runner, we provide the PIL images as input.
    For the vllm runner, we provide MultiModalData objects and the
    corresponding vision language config as input.
    Note that the text input is also adjusted to abide by the vllm contract.
    The text output is sanitized so that it can be compared with hf.
    """
    images = [asset.pil_image for asset in image_assets]

    inputs_per_image = [(
        [prompt for _ in size_factors],
        [rescale_image_size(image, factor) for factor in size_factors],
    ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]

    # max_model_len should be greater than image_feature_size
    with vllm_runner(model, dtype=dtype, enforce_eager=True) as vllm_model:
        vllm_outputs_per_image = [
            vllm_model.generate_greedy_logprobs(prompts,
                                                max_tokens,
                                                num_logprobs=num_logprobs,
                                                images=images)
            for prompts, images in inputs_per_image
        ]

    with hf_runner(model, dtype=dtype, is_vision_model=True) as hf_model:
        hf_outputs_per_image = [
            hf_model.generate_greedy_logprobs_limit(prompts,
                                                    max_tokens,
                                                    num_logprobs=num_logprobs,
                                                    images=images)
            for prompts, images in inputs_per_image
        ]

    for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
                                        vllm_outputs_per_image):
        check_logprobs_close(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=[
                vllm_to_hf_output(vllm_output, model)
                for vllm_output in vllm_outputs
            ],
            name_0="hf",
            name_1="vllm",
        )
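To run only this new test locally, an invocation along the lines of pytest tests/models/test_blip2.py -m vlm should work, assuming a CUDA-capable machine with the project's test dependencies installed and the custom vlm marker registered in the repository's pytest configuration.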
8 changes: 4 additions & 4 deletions tests/models/test_fuyu.py
@@ -77,8 +77,8 @@ def run_test(
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=vllm_images)
for prompts, vllm_images in inputs_per_image
images=images)
for prompts, images in inputs_per_image
]

with hf_runner(model, dtype=dtype) as hf_model:
@@ -89,9 +89,9 @@
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=hf_images,
images=images,
eos_token_id=eos_token_id)
for prompts, hf_images in inputs_per_image
for prompts, images in inputs_per_image
]

for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
8 changes: 4 additions & 4 deletions tests/models/test_minicpmv.py
@@ -88,9 +88,9 @@ def run_test(
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=vllm_images,
images=images,
stop_token_ids=stop_token_ids)
for prompts, vllm_images in inputs_per_image
for prompts, images in inputs_per_image
]

with hf_runner(model, dtype=dtype) as hf_model, torch.no_grad():
@@ -114,9 +114,9 @@ def to(self, device: torch.types.Device):
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=hf_images,
images=images,
tokenizer=tokenizer)
for prompts, hf_images in inputs_per_image
for prompts, images in inputs_per_image
]

for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
8 changes: 4 additions & 4 deletions tests/models/test_phi3v.py
@@ -101,8 +101,8 @@ def run_test(
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=vllm_images)
for prompts, vllm_images in inputs_per_image
images=images)
for prompts, images in inputs_per_image
]

# use eager mode for hf runner, since phi3_v didn't work with flash_attn
@@ -114,9 +114,9 @@
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=hf_images,
images=images,
eos_token_id=eos_token_id)
for prompts, hf_images in inputs_per_image
for prompts, images in inputs_per_image
]

for hf_outputs, vllm_outputs in zip(hf_outputs_per_image,
6 changes: 4 additions & 2 deletions vllm/model_executor/models/__init__.py
@@ -16,6 +16,8 @@
"BaiChuanForCausalLM": ("baichuan", "BaiChuanForCausalLM"), # baichuan-7b
"BaichuanForCausalLM": ("baichuan", "BaichuanForCausalLM"), # baichuan-13b
"BloomForCausalLM": ("bloom", "BloomForCausalLM"),
"Blip2ForConditionalGeneration":
("blip2", "Blip2ForConditionalGeneration"),
"ChameleonForConditionalGeneration":
("chameleon", "ChameleonForConditionalGeneration"),
"ChatGLMModel": ("chatglm", "ChatGLMForCausalLM"),
@@ -56,8 +58,8 @@
"OPTForCausalLM": ("opt", "OPTForCausalLM"),
"OrionForCausalLM": ("orion", "OrionForCausalLM"),
"PersimmonForCausalLM": ("persimmon", "PersimmonForCausalLM"),
"PaliGemmaForConditionalGeneration":
("paligemma", "PaliGemmaForConditionalGeneration"),
"PaliGemmaForConditionalGeneration": ("paligemma",
"PaliGemmaForConditionalGeneration"),
"PhiForCausalLM": ("phi", "PhiForCausalLM"),
"Phi3ForCausalLM": ("llama", "LlamaForCausalLM"),
"Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"),
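The registry above maps an architecture name, as reported in a checkpoint's config.json, to a (module, class) pair that vLLM imports on demand. A rough sketch of the lookup the new entry enables (illustrative only, not the actual resolver code in vLLM):

    import importlib

    module_name, class_name = ("blip2", "Blip2ForConditionalGeneration")
    module = importlib.import_module(f"vllm.model_executor.models.{module_name}")
    model_cls = getattr(module, class_name)  # class defined in vllm/model_executor/models/blip2.py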
