[Model][VLM] Add LLaVA-Onevision model support (vllm-project#8486)
Co-authored-by: litianjian <litianjian@bytedance.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Amit Garg <mitgarg17495@gmail.com>
5 people authored and garg-amit committed on Oct 28, 2024
1 parent: 219326d · commit: 99ee7a8
Showing 10 changed files with 1,330 additions and 21 deletions.
docs/source/models/supported_models.rst: 6 additions & 1 deletion
@@ -244,6 +244,11 @@ Multimodal Language Models
- Video
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
-
* - :code:`LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- Image\ :sup:`+` / Video
- :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. (see note)
-
* - :code:`MiniCPMV`
- MiniCPM-V
- Image\ :sup:`+`
@@ -288,7 +293,7 @@ Multimodal Language Models
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

.. note::
For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
For :code:`LLaVA-NeXT-Video`, :code:`LLaVA-Onevision` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
This can be installed by running the following command:

.. code-block:: bash
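
The install command itself is collapsed in this view. Given the commit hash cited in the note, it presumably takes the standard pinned-install form below; the exact URL form is an assumption, not a quote of the hidden lines.

.. code-block:: bash

    # Assumed form of the collapsed install step: pin transformers to the
    # developer commit referenced in the note above.
    pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
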
examples/offline_inference_vision_language.py: 47 additions & 13 deletions
@@ -14,7 +14,8 @@


# LLaVA-1.5
def run_llava(question):
def run_llava(question, modality):
assert modality == "image"

prompt = f"USER: <image>\n{question}\nASSISTANT:"

@@ -24,7 +25,8 @@ def run_llava(question):


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question):
def run_llava_next(question, modality):
assert modality == "image"

prompt = f"[INST] <image>\n{question} [/INST]"
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
@@ -34,15 +36,35 @@ def run_llava_next(question):

# LlaVA-NeXT-Video
# Currently only support for video input
def run_llava_next_video(question):
def run_llava_next_video(question, modality):
assert modality == "video"

prompt = f"USER: <video>\n{question} ASSISTANT:"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
stop_token_ids = None
return llm, prompt, stop_token_ids


# LLaVA-OneVision
def run_llava_onevision(question, modality):

if modality == "video":
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
<|im_start|>assistant\n"

elif modality == "image":
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=32768)
stop_token_ids = None
return llm, prompt, stop_token_ids


# Fuyu
def run_fuyu(question):
def run_fuyu(question, modality):
assert modality == "image"

prompt = f"{question}\n"
llm = LLM(model="adept/fuyu-8b")
@@ -51,7 +73,8 @@ def run_fuyu(question):


# Phi-3-Vision
def run_phi3v(question):
def run_phi3v(question, modality):
assert modality == "image"

prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501
# Note: The default setting of max_num_seqs (256) and
@@ -70,7 +93,8 @@ def run_phi3v(question):


# PaliGemma
def run_paligemma(question):
def run_paligemma(question, modality):
assert modality == "image"

# PaliGemma has special prompt format for VQA
prompt = "caption en"
@@ -80,7 +104,8 @@ def run_paligemma(question):


# Chameleon
def run_chameleon(question):
def run_chameleon(question, modality):
assert modality == "image"

prompt = f"{question}<image>"
llm = LLM(model="facebook/chameleon-7b")
@@ -89,7 +114,8 @@ def run_chameleon(question):


# MiniCPM-V
def run_minicpmv(question):
def run_minicpmv(question, modality):
assert modality == "image"

# 2.0
# The official repo doesn't work yet, so we need to use a fork for now
@@ -129,7 +155,9 @@ def run_minicpmv(question):


# InternVL
def run_internvl(question):
def run_internvl(question, modality):
assert modality == "image"

model_name = "OpenGVLab/InternVL2-2B"

llm = LLM(
@@ -155,7 +183,8 @@ def run_internvl(question):


# BLIP-2
def run_blip2(question):
def run_blip2(question, modality):
assert modality == "image"

# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
@@ -166,7 +195,8 @@ def run_blip2(question):


# Qwen
def run_qwen_vl(question):
def run_qwen_vl(question, modality):
assert modality == "image"

llm = LLM(
model="Qwen/Qwen-VL",
@@ -180,7 +210,9 @@ def run_qwen_vl(question):


# Qwen2-VL
def run_qwen2_vl(question):
def run_qwen2_vl(question, modality):
assert modality == "image"

model_name = "Qwen/Qwen2-VL-7B-Instruct"

llm = LLM(
@@ -200,6 +232,7 @@ def run_qwen2_vl(question):
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"llava-onevision": run_llava_onevision,
"fuyu": run_fuyu,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
@@ -255,7 +288,7 @@ def main(args):
data = mm_input["data"]
question = mm_input["question"]

llm, prompt, stop_token_ids = model_example_map[model](question)
llm, prompt, stop_token_ids = model_example_map[model](question, modality)

# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
@@ -306,6 +339,7 @@ def main(args):
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
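
For reference, the new llava-onevision entry in model_example_map can be exercised through the example script with either modality. Only --modality and --num-frames are visible in this diff; the --model-type flag name below is assumed from the script's existing argument parser.

.. code-block:: bash

    # Image input (default modality); --model-type is assumed to select the
    # model_example_map entry added in this commit.
    python examples/offline_inference_vision_language.py \
        --model-type llava-onevision --modality image

    # Video input, sampling 16 frames (--modality and --num-frames appear in this diff).
    python examples/offline_inference_vision_language.py \
        --model-type llava-onevision --modality video --num-frames 16
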
@@ -105,9 +105,6 @@ def run_test(
for asset in video_assets
]

for video in videos:
print(video.shape)

if size_factors is not None:
inputs_per_video = [(
[prompt for _ in size_factors],