
Commit 66f1d6c

youkaichao authored and jimpang committed
[ci][distributed] fix device count call
[ci][distributed] fix some cuda init that makes it necessary to use spawn (vllm-project#5991)
1 parent 28b2d9f commit 66f1d6c

6 files changed: +83 −51 lines changed
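For context on why the spawn workaround can now be removed: once a process has initialized CUDA, workers created with the default fork start method cannot use the GPU again. The snippet below is a minimal, self-contained reproduction of that failure mode; it is not vLLM code, and it assumes a CUDA-capable machine with PyTorch installed.

# Minimal repro sketch (not vLLM code): forked workers break after the
# parent initializes CUDA, which is why CI previously forced spawn.
import torch
import torch.multiprocessing as mp


def worker(rank: int) -> None:
    # Fails with "Cannot re-initialize CUDA in forked subprocess"
    # because the parent already created a CUDA context.
    torch.zeros(1, device="cuda")
    print(f"worker {rank} ran on the GPU")


if __name__ == "__main__":
    torch.zeros(1, device="cuda")  # parent touches CUDA first
    ctx = mp.get_context("fork")
    proc = ctx.Process(target=worker, args=(0,))
    proc.start()
    proc.join()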

.buildkite/test-pipeline.yaml

+1 −11

@@ -45,9 +45,6 @@ steps:
   num_gpus: 2
   commands:
   - bash ../.buildkite/download-images.sh
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -60,8 +57,7 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-  # FIXIT: find out why TP is failing with mp backend on phi3-v
-  # - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
+  - TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
@@ -71,9 +67,6 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -225,9 +218,6 @@ steps:
   gpu: a100
   num_gpus: 4
   commands:
-  # FIXIT: find out which code initialize cuda before running the test
-  # before the fix, we need to use spawn to test it
-  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   # NOTE: don't test llama model here, it seems hf implementation is buggy
   # see https://github.com/vllm-project/vllm/pull/5689 for details
   - pytest -v -s distributed/test_custom_all_reduce.py
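The removed `export VLLM_WORKER_MULTIPROC_METHOD=spawn` lines were only a workaround: they told the worker launcher to spawn processes instead of forking them. The sketch below illustrates how such an environment variable can select a start method; it is an assumption-laden illustration, not vLLM's actual worker code, and it assumes fork is the default, matching the comments in the tests further down.

# Illustrative sketch (not vLLM's implementation) of choosing the worker
# start method from VLLM_WORKER_MULTIPROC_METHOD; "fork" is assumed to be
# the default, which is what breaks once the parent has touched CUDA.
import multiprocessing as mp
import os


def get_worker_context():
    method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
    return mp.get_context(method)


if __name__ == "__main__":
    ctx = get_worker_context()
    print(f"worker start method: {ctx.get_start_method()}")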

tests/conftest.py

+16 −5

@@ -5,25 +5,29 @@
 from dataclasses import dataclass
 from functools import cached_property
 from pathlib import Path
-from typing import (Any, Dict, List, Literal, Optional, Tuple, TypedDict,
-                    TypeVar)
+from typing import (TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple,
+                    TypedDict, TypeVar)

 import pytest
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
-                          AutoProcessor, AutoTokenizer, BatchEncoding)
+                          AutoTokenizer, BatchEncoding)

 from vllm import LLM, SamplingParams
 from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
 from vllm.distributed import (destroy_distributed_environment,
                               destroy_model_parallel)
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.multimodal import MultiModalData
-from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+
+if TYPE_CHECKING:
+    from vllm.multimodal import MultiModalData
+else:
+    # it will call torch.cuda.device_count()
+    MultiModalData = None
 from vllm.sequence import SampleLogprobs
 from vllm.utils import cuda_device_count_stateless, is_cpu

@@ -63,6 +67,10 @@ def for_hf(self) -> Image.Image:
         return self.pil_image

     def for_vllm(self, vision_config: VisionLanguageConfig) -> MultiModalData:
+        # don't put this import at the top level
+        # it will call torch.cuda.device_count()
+        from vllm.multimodal.image import ImageFeatureData  # noqa: F401
+        from vllm.multimodal.image import ImagePixelData
         image_input_type = vision_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType

@@ -216,6 +224,9 @@ def __init__(
         )

         try:
+            # don't put this import at the top level
+            # it will call torch.cuda.device_count()
+            from transformers import AutoProcessor  # noqa: F401
             self.processor = AutoProcessor.from_pretrained(
                 model_name,
                 torch_dtype=torch_dtype,
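The conftest change defers any import that calls `torch.cuda.device_count()` at import time, so collecting the tests no longer initializes CUDA in the parent pytest process. Below is a generic, self-contained sketch of the pattern; `torch` merely stands in for the module whose import-time side effects must be delayed until first use.

# Generic sketch of the deferred-import pattern used in conftest.py above.
# The type is only imported for static type checkers; at runtime the heavy
# module is imported inside the function, the first time it is needed.
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from torch import Tensor  # resolved by type checkers only
else:
    Tensor = None  # placeholder so the name exists at runtime


def as_tensor(values: list) -> "Tensor":
    # deferred import: torch is loaded only when this function runs,
    # not when the module is collected by pytest
    import torch
    return torch.tensor(values)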

tests/distributed/test_basic_distributed_correctness.py

+10 −5

@@ -15,7 +15,8 @@
 import os

 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless

 from ..models.utils import check_outputs_equal

@@ -25,7 +26,7 @@
 DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
@@ -40,16 +41,20 @@ def test_models(
 ) -> None:
     distributed_executor_backend = os.getenv(DISTRIBUTED_EXECUTOR_BACKEND)

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).
     with vllm_runner(model,
                      dtype=dtype,
                      tensor_parallel_size=2,
                      distributed_executor_backend=distributed_executor_backend
                      ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
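The skipif now uses `cuda_device_count_stateless()` from `vllm.utils` so that counting GPUs leaves no CUDA state behind in the pytest process. One common way to get a count without touching CUDA locally is to run the query in a throwaway child process; the helper below is a hypothetical sketch of that idea (assuming PyTorch is installed), not vLLM's actual implementation.

# Hypothetical sketch of a "stateless" device count: query the GPU count in
# a short-lived child process so the current process never initializes CUDA
# and stays safe to fork. This is not vllm.utils.cuda_device_count_stateless.
import subprocess
import sys


def device_count_stateless() -> int:
    code = "import torch; print(torch.cuda.device_count())"
    result = subprocess.run([sys.executable, "-c", code],
                            capture_output=True, text=True, check=True)
    return int(result.stdout.strip())


if __name__ == "__main__":
    print(f"visible CUDA devices: {device_count_stateless()}")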

tests/distributed/test_chunked_prefill_distributed.py

+10 −4

@@ -14,7 +14,8 @@
 import os

 import pytest
-import torch
+
+from vllm.utils import cuda_device_count_stateless

 from ..models.utils import check_outputs_equal

@@ -24,7 +25,7 @@
 DISTRIBUTED_EXECUTOR_BACKEND = "DISTRIBUTED_EXECUTOR_BACKEND"


-@pytest.mark.skipif(torch.cuda.device_count() < 2,
+@pytest.mark.skipif(cuda_device_count_stateless() < 2,
                     reason="Need at least 2 GPUs to run the test.")
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
@@ -47,8 +48,10 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size

-    with hf_runner(model, dtype=dtype) as hf_model:
-        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).

     with vllm_runner(
         model,
@@ -61,6 +64,9 @@ def test_models(
     ) as vllm_model:
         vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)

+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
+
     check_outputs_equal(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
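The skipif condition matters because pytest evaluates decorator arguments when it imports the test module, i.e. at collection time in the parent process, long before any worker is forked. The toy test below (illustrative only, not part of the vLLM suite) makes that visible when run under pytest.

# Toy example: the skipif condition runs at import/collection time in the
# parent pytest process, which is why it must not initialize CUDA there.
import pytest


def fake_gpu_count() -> int:
    print("evaluated at collection time, in the parent pytest process")
    return 0  # pretend no GPUs are available


@pytest.mark.skipif(fake_gpu_count() < 2,
                    reason="Need at least 2 GPUs to run the test.")
def test_needs_two_gpus():
    assert True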

tests/models/test_llava.py

+20 −10

@@ -88,28 +88,38 @@ def run_test(
     """
     model_id, vlm_config = model_and_config
     hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]

-    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
-        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                              max_tokens,
-                                              images=hf_images)
-
-    vllm_image_prompts = [
-        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
-        for p in HF_IMAGE_PROMPTS
-    ]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).

     with vllm_runner(model_id,
                      dtype=dtype,
                      tensor_parallel_size=tensor_parallel_size,
                      distributed_executor_backend=distributed_executor_backend,
                      enforce_eager=True,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
+
+        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
+        # we must put it inside the vllm_runner context manager
+        # i.e. after creating vLLM instance.
+        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+
+        vllm_image_prompts = [
+            p.replace("<image>", "<image>" * vlm_config.image_feature_size)
+            for p in HF_IMAGE_PROMPTS
+        ]
+
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
                                                   images=vllm_images)

+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
+
     check_outputs_equal(
         hf_outputs,
         [
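Besides the reordering, the moved block also shows how the HF prompt is adapted for vLLM: the single `<image>` placeholder is repeated `image_feature_size` times. A standalone illustration follows; the feature size 576 and the prompt text are assumed example values, not read from the real `vlm_config` or `HF_IMAGE_PROMPTS`.

# Standalone illustration of the prompt rewrite in the diff above. The
# feature size 576 is an assumed example value, not read from vlm_config.
IMAGE_FEATURE_SIZE = 576

hf_prompt = "USER: <image>\nWhat's the content of the image?\nASSISTANT:"
vllm_prompt = hf_prompt.replace("<image>", "<image>" * IMAGE_FEATURE_SIZE)

assert vllm_prompt.count("<image>") == IMAGE_FEATURE_SIZE
print(f"expanded prompt length: {len(vllm_prompt)} characters")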

tests/models/test_phi3v.py

+26 −16

@@ -96,23 +96,11 @@ def run_test(
     """
     model_id, vlm_config = model_and_config
    hf_images = [asset.for_hf() for asset in image_assets]
-    vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]

-    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
-    hf_model_kwargs = {"_attn_implementation": "eager"}
-    with hf_runner(model_id, dtype=dtype,
-                   model_kwargs=hf_model_kwargs) as hf_model:
-        hf_outputs = hf_model.generate_greedy(
-            HF_IMAGE_PROMPTS,
-            max_tokens,
-            images=hf_images,
-            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
-
-    vllm_image_prompts = [
-        p.replace("<|image_1|>",
-                  "<|image|>" * vlm_config.image_feature_size + "<s>")
-        for p in HF_IMAGE_PROMPTS
-    ]
+    # NOTE: take care of the order. run vLLM first, and then run HF.
+    # vLLM needs a fresh new process without cuda initialization.
+    # if we run HF first, the cuda initialization will be done and it
+    # will hurt multiprocessing backend with fork method (the default method).

     with vllm_runner(model_id,
                      max_model_len=2048,
@@ -121,10 +109,32 @@ def run_test(
                      enforce_eager=True,
                      distributed_executor_backend=distributed_executor_backend,
                      **vlm_config.as_cli_args_dict()) as vllm_model:
+        # NOTE: `asset.for_vllm` will call `torch.cuda.device_count()`
+        # we must put it inside the vllm_runner context manager
+        # i.e. after creating vLLM instance.
+
+        vllm_images = [asset.for_vllm(vlm_config) for asset in image_assets]
+
+        vllm_image_prompts = [
+            p.replace("<|image_1|>",
+                      "<|image|>" * vlm_config.image_feature_size + "<s>")
+            for p in HF_IMAGE_PROMPTS
+        ]
+
         vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                                   max_tokens,
                                                   images=vllm_images)

+    # use eager mode for hf runner, since phi3_v didn't work with flash_attn
+    hf_model_kwargs = {"_attn_implementation": "eager"}
+    with hf_runner(model_id, dtype=dtype,
+                   model_kwargs=hf_model_kwargs) as hf_model:
+        hf_outputs = hf_model.generate_greedy(
+            HF_IMAGE_PROMPTS,
+            max_tokens,
+            images=hf_images,
+            eos_token_id=hf_model.processor.tokenizer.eos_token_id)
+
     check_outputs_equal(
         hf_outputs,
         [
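Running vLLM first only works because the `with vllm_runner(...)` block releases its model and GPU memory on exit before the HF runner starts. The sketch below mimics that structure with a stand-in class; the names are illustrative and not the real test fixtures' API.

# Stand-in sketch (illustrative names, not the real test fixtures): the
# first runner frees its resources in __exit__ before the second one starts,
# which is why the vLLM and HF runs live in separate context managers.
class FakeRunner:
    def __init__(self, name: str) -> None:
        self.name = name

    def __enter__(self) -> "FakeRunner":
        print(f"{self.name}: load model, allocate GPU memory")
        return self

    def generate_greedy(self, prompts, max_tokens: int):
        return [f"{self.name} output for {p!r}" for p in prompts]

    def __exit__(self, exc_type, exc, tb) -> None:
        print(f"{self.name}: free model and GPU memory")


with FakeRunner("vllm") as vllm_model:
    vllm_outputs = vllm_model.generate_greedy(["a prompt"], max_tokens=8)

with FakeRunner("hf") as hf_model:
    hf_outputs = hf_model.generate_greedy(["a prompt"], max_tokens=8)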
