From a199c27edb68a53f2ceac3589a1399973ccf5f17 Mon Sep 17 00:00:00 2001 From: Cyrus Leung Date: Sat, 29 Jun 2024 23:45:54 +0800 Subject: [PATCH] [CI/Build] Add TP test for vision models (#5892) --- .buildkite/test-pipeline.yaml | 5 ++ .../distributed/test_multimodal_broadcast.py | 51 +++++++++++++++++++ tests/models/test_llava.py | 39 +++++++++++--- tests/models/test_phi3v.py | 49 +++++++++++++----- .../device_communicators/shm_broadcast.py | 1 + vllm/distributed/parallel_state.py | 4 +- vllm/model_executor/models/llava.py | 2 +- vllm/model_executor/models/llava_next.py | 2 +- vllm/model_executor/models/phi3v.py | 5 +- 9 files changed, 131 insertions(+), 27 deletions(-) create mode 100644 tests/distributed/test_multimodal_broadcast.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 023696f3cea9c..0a0bb55675458 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -44,6 +44,7 @@ steps: working_dir: "/vllm-workspace/tests" num_gpus: 2 commands: + - bash ../.buildkite/download-images.sh # FIXIT: find out which code initialize cuda before running the test # before the fix, we need to use spawn to test it - export VLLM_WORKER_MULTIPROC_METHOD=spawn @@ -52,10 +53,14 @@ steps: - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py + - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py + - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py diff --git a/tests/distributed/test_multimodal_broadcast.py b/tests/distributed/test_multimodal_broadcast.py new file mode 100644 index 0000000000000..41c3fd9e7f6b3 --- /dev/null +++ b/tests/distributed/test_multimodal_broadcast.py @@ -0,0 +1,51 @@ +"""Compare the outputs of HF and distributed vLLM when using greedy sampling. +The second test will hang if more than one test is run per command, so we need +to run the tests one by one. The solution is to pass arguments (model name) by +environment variables. 
+ +Run: +```sh +TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf \ + test_multimodal_broadcast.py +TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct \ + test_multimodal_broadcast.py +``` +""" +import os + +import pytest + +from vllm.utils import cuda_device_count_stateless + +model = os.environ["TEST_DIST_MODEL"] + +if model.startswith("llava-hf/llava"): + from ..models.test_llava import model_and_vl_config, run_test +elif model.startswith("microsoft/Phi-3-vision"): + from ..models.test_phi3v import model_and_vl_config, run_test +else: + raise NotImplementedError(f"Unsupported model: {model}") + + +@pytest.mark.parametrize("tensor_parallel_size", [2]) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models(hf_runner, vllm_runner, image_assets, + tensor_parallel_size: int, dtype: str, + max_tokens: int) -> None: + if cuda_device_count_stateless() < tensor_parallel_size: + pytest.skip( + f"Need at least {tensor_parallel_size} GPUs to run the test.") + + distributed_executor_backend = os.getenv("DISTRIBUTED_EXECUTOR_BACKEND") + + run_test( + hf_runner, + vllm_runner, + image_assets, + model_and_config=model_and_vl_config[0], + dtype=dtype, + max_tokens=max_tokens, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + ) diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py index ac1d2ece62b26..f2dfd4bb8596f 100644 --- a/tests/models/test_llava.py +++ b/tests/models/test_llava.py @@ -1,11 +1,11 @@ -from typing import List, Tuple +from typing import List, Optional, Tuple, Type import pytest from transformers import AutoTokenizer from vllm.config import VisionLanguageConfig -from ..conftest import IMAGE_ASSETS +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets pytestmark = pytest.mark.vlm @@ -65,12 +65,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], return hf_output_ids, hf_output_str -# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] -@pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, image_assets, model_and_config, - dtype: str, max_tokens: int) -> None: +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model_and_config: Tuple[str, VisionLanguageConfig], + *, + dtype: str, + max_tokens: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. 
@@ -96,6 +101,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config, with vllm_runner(model_id, dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, enforce_eager=True, **vlm_config.as_cli_args_dict()) as vllm_model: vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts, @@ -110,3 +117,19 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config, f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") assert hf_output_ids == vllm_output_ids, ( f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") + + +@pytest.mark.parametrize("model_and_config", model_and_vl_config) +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +def test_models(hf_runner, vllm_runner, image_assets, model_and_config, + dtype: str, max_tokens: int) -> None: + run_test( + hf_runner, + vllm_runner, + image_assets, + model_and_config, + dtype=dtype, + max_tokens=max_tokens, + tensor_parallel_size=1, + ) diff --git a/tests/models/test_phi3v.py b/tests/models/test_phi3v.py index 03c1304668366..e7d5639494104 100644 --- a/tests/models/test_phi3v.py +++ b/tests/models/test_phi3v.py @@ -1,4 +1,4 @@ -from typing import List, Tuple +from typing import List, Optional, Tuple, Type import pytest from transformers import AutoTokenizer @@ -6,7 +6,7 @@ from vllm.config import VisionLanguageConfig from vllm.utils import is_cpu -from ..conftest import IMAGE_ASSETS +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets pytestmark = pytest.mark.vlm @@ -73,17 +73,17 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str], target_dtype = "bfloat16" -# TODO: Add test for `tensor_parallel_size` [ref: PR #3883] -# Since we use _attn_implementation="eager" for hf_runner, here is -# numeric difference for longer context and test can't pass -@pytest.mark.xfail( - reason="Inconsistent image processor being used due to lack " - "of support for dynamic image token replacement") -@pytest.mark.parametrize("model_and_config", model_and_vl_config) -@pytest.mark.parametrize("dtype", [target_dtype]) -@pytest.mark.parametrize("max_tokens", [128]) -def test_models(hf_runner, vllm_runner, image_assets, model_and_config, - dtype: str, max_tokens: int) -> None: +def run_test( + hf_runner: Type[HfRunner], + vllm_runner: Type[VllmRunner], + image_assets: _ImageAssets, + model_and_config: Tuple[str, VisionLanguageConfig], + *, + dtype: str, + max_tokens: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): """Inference result should be the same between hf and vllm. All the image fixtures for the test is under tests/images. 
@@ -116,7 +116,9 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
     with vllm_runner(model_id,
                      max_model_len=2048,
                      dtype=dtype,
+                     tensor_parallel_size=tensor_parallel_size,
                      enforce_eager=True,
+                     distributed_executor_backend=distributed_executor_backend,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
@@ -130,3 +132,24 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+# Since we use _attn_implementation="eager" for hf_runner, here is
+# numeric difference for longer context and test can't pass
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", [target_dtype])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
+                dtype: str, max_tokens: int) -> None:
+    run_test(
+        hf_runner,
+        vllm_runner,
+        image_assets,
+        model_and_config,
+        dtype=dtype,
+        max_tokens=max_tokens,
+        tensor_parallel_size=1,
+    )
diff --git a/vllm/distributed/device_communicators/shm_broadcast.py b/vllm/distributed/device_communicators/shm_broadcast.py
index 550271f881df5..bea205882d9d8 100644
--- a/vllm/distributed/device_communicators/shm_broadcast.py
+++ b/vllm/distributed/device_communicators/shm_broadcast.py
@@ -268,6 +268,7 @@ def broadcast_object(self, obj=None):
         else:
             return self.dequeue()
 
+    @staticmethod
     def create_from_process_group(pg: ProcessGroup,
                                   max_chunk_bytes,
                                   max_chunks,
diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py
index 0c4ee0eb2c04c..4ebb8703e0f44 100644
--- a/vllm/distributed/parallel_state.py
+++ b/vllm/distributed/parallel_state.py
@@ -194,7 +194,7 @@ def __init__(
         self.shm_broadcaster: Optional[ShmRingBufferIO] = None
         if self.world_size > 1 and is_in_the_same_node(self.cpu_group):
             self.shm_broadcaster = ShmRingBufferIO.create_from_process_group(
-                self.cpu_group, 1 << 20, 6)
+                self.cpu_group, 1 << 22, 6)
 
     @property
     def first_rank(self):
@@ -690,6 +690,8 @@ def destroy(self):
             self.pynccl_comm = None
         if self.ca_comm is not None:
             self.ca_comm = None
+        if self.shm_broadcaster is not None:
+            self.shm_broadcaster = None
 
 
 _WORLD: Optional[GroupCoordinator] = None
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index bdcb6331730ab..ba4496f9cfac5 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -219,7 +219,7 @@ def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
 
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values.to(vision_tower.device),
+        image_features = vision_tower(pixel_values,
                                       self.config.vision_feature_layer)
 
         return self._select_image_features(
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index cebc828165ed7..2814310746715 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -301,7 +301,7 @@ def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
 
         # NOTE: we skip the step to select the vision feature layer since
         # this is already done inside the vision tower
-        image_features = vision_tower(pixel_values.to(vision_tower.device),
+        image_features = vision_tower(pixel_values,
                                       self.config.vision_feature_layer)
 
         return self._select_image_features(
diff --git a/vllm/model_executor/models/phi3v.py b/vllm/model_executor/models/phi3v.py
index 5d8ffd5215c52..bc3d3f0fbf194 100644
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -157,7 +157,6 @@ def forward(self, input_ids: torch.LongTensor,
 
         select = False
 
-        target_device = self.img_projection[0].bias.device
         target_dtype = self.img_projection[0].bias.dtype
 
         if len(positions.tolist()) > 0:
@@ -231,7 +230,7 @@ def forward(self, input_ids: torch.LongTensor,
             img_set_tensor = []
             for _output_img in output_imgs:
                 img_feature_proj = self.img_projection(
-                    _output_img.to(target_device, target_dtype))
+                    _output_img.to(target_dtype))
                 img_set_tensor.append(img_feature_proj)
             select = True
 
@@ -245,7 +244,7 @@ def forward(self, input_ids: torch.LongTensor,
                 hidden_states[positions[idx, 0],
                               positions[idx, 1]:positions[idx, 1] +
                               cnt] = (img_set_tensor[i].to(
-                                  hidden_states.device, hidden_states.dtype))
+                                  hidden_states.dtype))
                 idx += cnt
 
         return hidden_states.squeeze(0)
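
As a usage sketch based on the CI commands added to .buildkite/test-pipeline.yaml above (not part of the patch itself), the new distributed test can be exercised locally from the tests/ directory, one model per pytest invocation, on a machine with at least two GPUs and with the test images fetched first:

```sh
bash ../.buildkite/download-images.sh

TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray \
    pytest -v -s distributed/test_multimodal_broadcast.py
TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp \
    pytest -v -s distributed/test_multimodal_broadcast.py
```

The test skips itself when cuda_device_count_stateless() reports fewer GPUs than the requested tensor_parallel_size, so running it on a smaller machine is harmless.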
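The parallel_state.py change quadruples the shared-memory broadcaster's chunk size (1 << 20 to 1 << 22 bytes) while keeping max_chunks at 6, presumably to give the CPU-side object broadcast more headroom for the multimodal inputs exercised by the new test. A rough capacity check, assuming total capacity is simply max_chunk_bytes * max_chunks and ignoring any per-chunk bookkeeping:

```sh
# Back-of-envelope only; the actual buffer layout lives in shm_broadcast.py.
python3 -c 'print(f"old: {(1 << 20) * 6 / 2**20:.0f} MiB, new: {(1 << 22) * 6 / 2**20:.0f} MiB")'
# old: 6 MiB, new: 24 MiB
```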