[Model][VLM] Add LLaVA-Onevision model support (vllm-project#8486)
Co-authored-by: litianjian <litianjian@bytedance.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Amit Garg <mitgarg17495@gmail.com>
5 people authored and garg-amit committed on Oct 28, 2024
1 parent: 219326d · commit: 99ee7a8
Showing 10 changed files with 1,330 additions and 21 deletions.
docs/source/models/supported_models.rst: 6 additions & 1 deletion
@@ -244,6 +244,11 @@ Multimodal Language Models
- Video
- :code:`llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. (see note)
-
* - :code:`LlavaOnevisionForConditionalGeneration`
- LLaVA-Onevision
- Image\ :sup:`+` / Video
- :code:`llava-hf/llava-onevision-qwen2-7b-ov-hf`, :code:`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. (see note)
-
* - :code:`MiniCPMV`
- MiniCPM-V
- Image\ :sup:`+`
@@ -288,7 +293,7 @@ Multimodal Language Models
For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630

.. note::
For :code:`LLaVA-NeXT-Video` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
For :code:`LLaVA-NeXT-Video`, :code:`LLaVA-Onevision` and :code:`Qwen2-VL`, the latest release of :code:`huggingface/transformers` doesn't work yet, so we need to use a developer version (:code:`21fac7abba2a37fae86106f87fcf9974fd1e3830`) for now.
This can be installed by running the following command:

.. code-block:: bash
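
The install command itself is collapsed in this view. Given the commit hash cited in the note, it presumably takes the standard pinned-install form below; the exact URL form is an assumption, not a quote of the hidden lines.

.. code-block:: bash

    # Assumed form of the collapsed install step: pin transformers to the
    # developer commit referenced in the note above.
    pip install git+https://github.com/huggingface/transformers.git@21fac7abba2a37fae86106f87fcf9974fd1e3830
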
examples/offline_inference_vision_language.py: 47 additions & 13 deletions
@@ -14,7 +14,8 @@


# LLaVA-1.5
def run_llava(question):
def run_llava(question, modality):
assert modality == "image"

prompt = f"USER: <image>\n{question}\nASSISTANT:"

@@ -24,7 +25,8 @@ def run_llava(question):


# LLaVA-1.6/LLaVA-NeXT
def run_llava_next(question):
def run_llava_next(question, modality):
assert modality == "image"

prompt = f"[INST] <image>\n{question} [/INST]"
llm = LLM(model="llava-hf/llava-v1.6-mistral-7b-hf", max_model_len=8192)
@@ -34,15 +36,35 @@ def run_llava_next(question):

# LlaVA-NeXT-Video
# Currently only support for video input
def run_llava_next_video(question):
def run_llava_next_video(question, modality):
assert modality == "video"

prompt = f"USER: <video>\n{question} ASSISTANT:"
llm = LLM(model="llava-hf/LLaVA-NeXT-Video-7B-hf", max_model_len=8192)
stop_token_ids = None
return llm, prompt, stop_token_ids


# LLaVA-OneVision
def run_llava_onevision(question, modality):

if modality == "video":
prompt = f"<|im_start|>user <video>\n{question}<|im_end|> \
<|im_start|>assistant\n"

elif modality == "image":
prompt = f"<|im_start|>user <image>\n{question}<|im_end|> \
<|im_start|>assistant\n"

llm = LLM(model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=32768)
stop_token_ids = None
return llm, prompt, stop_token_ids


# Fuyu
def run_fuyu(question):
def run_fuyu(question, modality):
assert modality == "image"

prompt = f"{question}\n"
llm = LLM(model="adept/fuyu-8b")
@@ -51,7 +73,8 @@ def run_fuyu(question):


# Phi-3-Vision
def run_phi3v(question):
def run_phi3v(question, modality):
assert modality == "image"

prompt = f"<|user|>\n<|image_1|>\n{question}<|end|>\n<|assistant|>\n" # noqa: E501
# Note: The default setting of max_num_seqs (256) and
@@ -70,7 +93,8 @@ def run_phi3v(question):


# PaliGemma
def run_paligemma(question):
def run_paligemma(question, modality):
assert modality == "image"

# PaliGemma has special prompt format for VQA
prompt = "caption en"
@@ -80,7 +104,8 @@ def run_paligemma(question):


# Chameleon
def run_chameleon(question):
def run_chameleon(question, modality):
assert modality == "image"

prompt = f"{question}<image>"
llm = LLM(model="facebook/chameleon-7b")
@@ -89,7 +114,8 @@ def run_chameleon(question):


# MiniCPM-V
def run_minicpmv(question):
def run_minicpmv(question, modality):
assert modality == "image"

# 2.0
# The official repo doesn't work yet, so we need to use a fork for now
@@ -129,7 +155,9 @@ def run_minicpmv(question):


# InternVL
def run_internvl(question):
def run_internvl(question, modality):
assert modality == "image"

model_name = "OpenGVLab/InternVL2-2B"

llm = LLM(
@@ -155,7 +183,8 @@ def run_internvl(question):


# BLIP-2
def run_blip2(question):
def run_blip2(question, modality):
assert modality == "image"

# BLIP-2 prompt format is inaccurate on HuggingFace model repository.
# See https://huggingface.co/Salesforce/blip2-opt-2.7b/discussions/15#64ff02f3f8cf9e4f5b038262 #noqa
@@ -166,7 +195,8 @@ def run_blip2(question):


# Qwen
def run_qwen_vl(question):
def run_qwen_vl(question, modality):
assert modality == "image"

llm = LLM(
model="Qwen/Qwen-VL",
@@ -180,7 +210,9 @@ def run_qwen_vl(question):


# Qwen2-VL
def run_qwen2_vl(question):
def run_qwen2_vl(question, modality):
assert modality == "image"

model_name = "Qwen/Qwen2-VL-7B-Instruct"

llm = LLM(
@@ -200,6 +232,7 @@ def run_qwen2_vl(question):
"llava": run_llava,
"llava-next": run_llava_next,
"llava-next-video": run_llava_next_video,
"llava-onevision": run_llava_onevision,
"fuyu": run_fuyu,
"phi3_v": run_phi3v,
"paligemma": run_paligemma,
@@ -255,7 +288,7 @@ def main(args):
data = mm_input["data"]
question = mm_input["question"]

llm, prompt, stop_token_ids = model_example_map[model](question)
llm, prompt, stop_token_ids = model_example_map[model](question, modality)

# We set temperature to 0.2 so that outputs can be different
# even when all prompts are identical when running batch inference.
@@ -306,6 +339,7 @@ def main(args):
parser.add_argument('--modality',
type=str,
default="image",
choices=['image', 'video'],
help='Modality of the input.')
parser.add_argument('--num-frames',
type=int,
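
For reference, the new llava-onevision entry in model_example_map can be exercised through the example script with either modality. Only --modality and --num-frames are visible in this diff; the --model-type flag name below is assumed from the script's existing argument parser.

.. code-block:: bash

    # Image input (default modality); --model-type is assumed to select the
    # model_example_map entry added in this commit.
    python examples/offline_inference_vision_language.py \
        --model-type llava-onevision --modality image

    # Video input, sampling 16 frames (--modality and --num-frames appear in this diff).
    python examples/offline_inference_vision_language.py \
        --model-type llava-onevision --modality video --num-frames 16
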
@@ -105,9 +105,6 @@ def run_test(
for asset in video_assets
]

for video in videos:
print(video.shape)

if size_factors is not None:
inputs_per_video = [(
[prompt for _ in size_factors],