Commit

[Core][Frontend] Support Passing Multimodal Processor Kwargs (vllm-project#8657)

Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com>
alex-jw-brooks authored and MengqingCao committed Sep 30, 2024
1 parent 8578547 commit 0bbf5a6
Showing 16 changed files with 589 additions and 116 deletions.
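
This commit threads a new mm_processor_kwargs option through ModelConfig and the engine CLI (--mm-processor-kwargs, decoded from a JSON string, per the parser test below), so callers can override multimodal processor defaults at initialization time. A minimal usage sketch, assuming a model whose HF processor accepts a num_crops override; the model name here is illustrative, not taken from this diff:

from vllm import LLM

# Sketch: pass processor overrides at construction time. Whether a given
# kwarg (e.g. num_crops) is honored depends on the model's own processor.
llm = LLM(
    model="microsoft/Phi-3.5-vision-instruct",  # assumed example model
    mm_processor_kwargs={"num_crops": 4},
)

The same dict can be supplied on the command line as a JSON string, e.g. --mm-processor-kwargs '{"num_crops": 4}'.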
21 changes: 21 additions & 0 deletions tests/engine/test_arg_utils.py
@@ -40,3 +40,24 @@ def test_limit_mm_per_prompt_parser(arg, expected):
 def test_bad_nullable_kvs(arg):
     with pytest.raises(ArgumentTypeError):
         nullable_kvs(arg)
+
+
+@pytest.mark.parametrize(("arg", "expected"), [
+    (None, None),
+    ("{}", {}),
+    ('{"num_crops": 4}', {
+        "num_crops": 4
+    }),
+    ('{"foo": {"bar": "baz"}}', {
+        "foo": {
+            "bar": "baz"
+        }
+    }),
+])
+def test_mm_processor_kwargs_prompt_parser(arg, expected):
+    parser = EngineArgs.add_cli_args(FlexibleArgumentParser())
+    if arg is None:
+        args = parser.parse_args([])
+    else:
+        args = parser.parse_args(["--mm-processor-kwargs", arg])
+    assert args.mm_processor_kwargs == expected
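
The parametrized cases show the flag defaulting to None and otherwise being decoded from a JSON string into a (possibly nested) dict. A minimal sketch of an argparse type with that behavior — json_dict is a hypothetical helper name, not the one vLLM actually registers:

import json
from argparse import ArgumentTypeError


def json_dict(value: str) -> dict:
    # Hypothetical argparse type: decode the CLI string as JSON and
    # require the top-level value to be an object, mirroring the cases
    # exercised by test_mm_processor_kwargs_prompt_parser above.
    try:
        parsed = json.loads(value)
    except json.JSONDecodeError as e:
        raise ArgumentTypeError(f"Invalid JSON: {e}") from e
    if not isinstance(parsed, dict):
        raise ArgumentTypeError("Expected a JSON object")
    return parsed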
29 changes: 1 addition & 28 deletions tests/models/decoder_only/vision_language/test_qwen.py
@@ -5,14 +5,13 @@
 import torch
 from PIL.Image import Image
 
-from vllm.config import ModelConfig
 from vllm.inputs import InputContext, LLMInputs
 from vllm.multimodal.base import MultiModalInputs
 from vllm.multimodal.utils import cached_get_tokenizer, rescale_image_size
 
 from ....conftest import (IMAGE_ASSETS, HfRunner, ImageAsset, PromptImageInput,
                           VllmRunner, _ImageAssets)
-from ...utils import check_logprobs_close
+from ...utils import build_model_context, check_logprobs_close
 
 text_only_models = [
     "Qwen/Qwen-7B-Chat"  # Has no visual component
@@ -42,32 +41,6 @@
 IMG_SIZE = 448
 
 
-def build_model_context(model_name: str,
-                        tokenizer_name: Optional[str] = None,
-                        trust_remote_code: bool = False):
-    """Creates an InputContext for a given model.
-    Args:
-        model_name: Name of the model being considered.
-        tokenizer_name: Name of the tokenizer being considered.
-        trust_remote_code: Whether or not to allow loading remote code.
-    Returns:
-        InputContext for the model being considered.
-    """
-    if tokenizer_name is None:
-        tokenizer_name = model_name
-    model_config = ModelConfig(
-        model_name,
-        tokenizer_name,
-        tokenizer_mode="auto",
-        trust_remote_code=trust_remote_code,
-        dtype="float32",
-        seed=0,
-    )
-    return InputContext(model_config)
-
-
 @pytest.fixture()
 def input_mapper_for_qwen():
     # Lazy import to avoid initializing CUDA during test collection
35 changes: 35 additions & 0 deletions tests/models/utils.py
@@ -1,6 +1,8 @@
 import warnings
 from typing import Dict, List, Optional, Sequence, Tuple, Union
 
+from vllm.config import ModelConfig
+from vllm.inputs import InputContext
 from vllm.sequence import Logprob, PromptLogprobs, SampleLogprobs
 
 TokensText = Tuple[List[int], str]
@@ -240,3 +242,36 @@ def check_logprobs_close(
         warnings.simplefilter("always")
 
         warnings.warn(fail_msg, stacklevel=2)
+
+
+def build_model_context(model_name: str,
+                        tokenizer_name: Optional[str] = None,
+                        trust_remote_code: bool = False,
+                        mm_processor_kwargs: Optional[Dict] = None,
+                        limit_mm_per_prompt: Optional[Dict] = None):
+    """Creates an InputContext for a given model.
+    Args:
+        model_name: Name of the model being considered.
+        tokenizer_name: Name of the tokenizer being considered.
+        trust_remote_code: Whether or not to allow loading remote code.
+        mm_processor_kwargs: Optional processor kwargs to be leveraged
+            in the input processor, mapper, dummy data creation, etc.
+        limit_mm_per_prompt: Multimodal limits.
+    Returns:
+        InputContext for the model being considered.
+    """
+    if tokenizer_name is None:
+        tokenizer_name = model_name
+    model_config = ModelConfig(
+        model_name,
+        tokenizer_name,
+        tokenizer_mode="auto",
+        trust_remote_code=trust_remote_code,
+        dtype="float32",
+        seed=0,
+        mm_processor_kwargs=mm_processor_kwargs,
+        limit_mm_per_prompt=limit_mm_per_prompt,
+    )
+    return InputContext(model_config)
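
A short usage sketch for the relocated helper — the model name, trust_remote_code value, and num_crops kwarg are placeholders, and which kwargs a model honors is model-specific:

# Hypothetical test usage: the kwargs land on the InputContext's model_config.
ctx = build_model_context(
    model_name="Qwen/Qwen-VL",  # placeholder model
    trust_remote_code=True,
    mm_processor_kwargs={"num_crops": 4},
)
assert ctx.model_config.mm_processor_kwargs == {"num_crops": 4}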
