@@ -1,12 +1,12 @@
-from typing import List, Tuple
+from typing import List, Optional, Tuple, Type

 import pytest
 from transformers import AutoTokenizer

 from vllm.config import VisionLanguageConfig
 from vllm.utils import is_cpu

-from ..conftest import IMAGE_ASSETS
+from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets

 pytestmark = pytest.mark.vlm

@@ -73,17 +73,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
 target_dtype = "bfloat16"


-# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
-# Since we use _attn_implementation="eager" for hf_runner, here is
-# numeric difference for longer context and test can't pass
-@pytest.mark.xfail(
-    reason="Inconsistent image processor being used due to lack "
-    "of support for dynamic image token replacement")
-@pytest.mark.parametrize("model_and_config", model_and_vl_config)
-@pytest.mark.parametrize("dtype", [target_dtype])
-@pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
-                dtype: str, max_tokens: int) -> None:
+def run_test(
+    hf_runner: Type[HfRunner],
+    vllm_runner: Type[VllmRunner],
+    image_assets: _ImageAssets,
+    model_and_config: Tuple[str, VisionLanguageConfig],
+    *,
+    dtype: str,
+    max_tokens: int,
+    tensor_parallel_size: int,
+    distributed_executor_backend: Optional[str] = None,
+):
     """Inference result should be the same between hf and vllm.

     All the image fixtures for the test is under tests/images.
@@ -116,7 +116,9 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     with vllm_runner(model_id,
                      max_model_len=2048,
                      dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
                      enforce_eager=True,
+                     distributed_executor_backend=distributed_executor_backend,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
@@ -130,3 +132,24 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+# Since we use _attn_implementation="eager" for hf_runner, here is
+# numeric difference for longer context and test can't pass
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model_and_config,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        tensor_parallel_size=1,
+    )
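
With the test body factored out into run_test, a multi-GPU variant of this test only needs to call it with a different tensor_parallel_size. Below is a minimal sketch of such a caller, assuming it lives in the same module as run_test (so model_and_vl_config and target_dtype are in scope). The test name and the choice of "ray" as the distributed_executor_backend are illustrative assumptions, not part of this diff.

# A hypothetical distributed counterpart (not part of this diff) that
# reuses run_test; "ray" and "mp" are the executor backend values vLLM
# accepts for this argument.
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_models_distributed(hf_runner, vllm_runner, image_assets,
                            model_and_config, dtype: str,
                            max_tokens: int) -> None:
    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model_and_config,
        dtype=dtype,
        max_tokens=max_tokens,
        # Shard the model across two GPUs via tensor parallelism.
        tensor_parallel_size=2,
        distributed_executor_backend="ray",
    )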