[distributed][misc] use fork by default for mp (vllm-project#5669)
youkaichao authored and robertgshaw2-neuralmagic committed Jun 23, 2024
1 parent c477239 commit 5ccb86c
Showing 3 changed files with 38 additions and 3 deletions.
9 changes: 9 additions & 0 deletions .buildkite/test-pipeline.yaml
@@ -37,6 +37,9 @@ steps:
working_dir: "/vllm-workspace/tests"
num_gpus: 2
commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to run this test
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
- TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
- TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
working_dir: "/vllm-workspace/tests"
num_gpus: 4
commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to run this test
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@
num_gpus: 4
# This test runs llama 13B, so it is required to run on 4 GPUs.
commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to run this test
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py

- label: Tensorizer Test
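The FIXIT comments added in the pipeline hunks above work around a CUDA constraint: once the parent process has initialized CUDA, a forked child cannot use it again. The following standalone sketch illustrates that failure mode; it is not part of this commit and assumes PyTorch with at least one CUDA device is available.

import multiprocessing

import torch


def child():
    # CUDA was already initialized in the parent, so this call is expected
    # to fail in a forked child, typically with "Cannot re-initialize CUDA
    # in forked subprocess".
    torch.zeros(1, device="cuda")


if __name__ == "__main__":
    torch.cuda.init()  # the parent initializes CUDA before forking
    p = multiprocessing.get_context("fork").Process(target=child)
    p.start()
    p.join()
    assert p.exitcode != 0  # the forked child could not use CUDA

Exporting VLLM_WORKER_MULTIPROC_METHOD=spawn gives each worker a fresh interpreter with a clean CUDA state, which is why the tests above opt back into spawn until the early CUDA initialization is tracked down.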
28 changes: 27 additions & 1 deletion vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,6 +1,9 @@
import ctypes
import json
import os
+import pickle
+import subprocess
+import sys
from itertools import product
from typing import Dict, List, Optional, Sequence

@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
ids = list(range(num_dev))
# batch of all pairs of GPUs
batch_src, batch_tgt = zip(*list(product(ids, ids)))
-    result = can_actually_p2p(batch_src, batch_tgt)
+    # NOTE: we use `subprocess` rather than `multiprocessing` here because
+    # the caller might not have an `if __name__ == "__main__":` guard, in
+    # which case the spawn method of `multiprocessing` cannot be used.
+    # However, `can_actually_p2p` requires the spawn method.
+    # The fix: call the function via `subprocess`, since this file does
+    # have an `if __name__ == "__main__":` block.
+    input_bytes = pickle.dumps((batch_src, batch_tgt))
+    returned = subprocess.run([sys.executable, __file__],
+                              input=input_bytes,
+                              capture_output=True)
+    # check whether the subprocess succeeded
+    try:
+        returned.check_returncode()
+    except Exception as e:
+        # wrap the raised exception to provide more information
+        raise RuntimeError(
+            f"Error occurred when batch testing "
+            f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+    result = pickle.loads(returned.stdout)
for _i, _j, r in zip(batch_src, batch_tgt, result):
cache[f"{_i}->{_j}"] = r
with open(path, "w") as f:
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:


__all__ = ["gpu_p2p_access_check"]

+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))
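As background for the NOTE comment above: the spawn start method launches a fresh interpreter that re-imports the caller's main module, so any process-creating code in that module must sit behind an `if __name__ == "__main__":` guard, and an arbitrary caller of `gpu_p2p_access_check` may not have one. A minimal sketch of the guarded pattern, unrelated to vLLM's own code:

import multiprocessing


def worker(x):
    return x + 1


# With the guard, the spawned child re-imports this module but skips the
# block below; without the guard, the child would re-run the Pool creation
# on import and multiprocessing raises a RuntimeError about the missing
# guard.
if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    with ctx.Pool(1) as pool:
        print(pool.map(worker, [1, 2]))  # prints [2, 3]

Running the check in a separate subprocess on this very file sidesteps that requirement, because the `if __name__ == "__main__":` block lives in custom_all_reduce_utils.py rather than in the caller.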
4 changes: 2 additions & 2 deletions vllm/envs.py
@@ -29,7 +29,7 @@
VLLM_CPU_KVCACHE_SPACE: int = 0
VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_TARGET_DEVICE: str = "cuda"
MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@
# Use dedicated multiprocess context for workers.
# Both spawn and fork work
"VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),

# Timeout for fetching images when serving multimodal models
# Default is 5 seconds
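With the default flipped to "fork", users can still opt back into "spawn" through the environment, as the test pipeline above does. The sketch below shows one plausible way a worker launcher could consume this setting; the function name is hypothetical and the actual vLLM call sites are not part of this diff.

import multiprocessing
import os


def get_worker_mp_context():
    # Mirrors the envs.py lambda above: "fork" unless overridden, e.g.
    #   VLLM_WORKER_MULTIPROC_METHOD=spawn python serve.py
    method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
    return multiprocessing.get_context(method)


if __name__ == "__main__":
    ctx = get_worker_mp_context()
    p = ctx.Process(target=print, args=("worker started",))
    p.start()
    p.join()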
