From 6c5b7af1525a2013d7b1806dd6c0c9a53404be6d Mon Sep 17 00:00:00 2001
From: youkaichao
Date: Thu, 20 Jun 2024 17:06:34 -0700
Subject: [PATCH] [distributed][misc] use fork by default for mp (#5669)

---
 .buildkite/test-pipeline.yaml                 |  9 ++++++
 .../custom_all_reduce_utils.py                | 28 ++++++++++++++++++-
 vllm/envs.py                                  |  4 ++--
 3 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 95cd5b1989ee..5e92ba3c24f5 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -37,6 +37,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
@@ -55,6 +58,9 @@ steps:
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s distributed/test_pynccl.py
   # We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
   # See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
@@ -145,6 +151,9 @@ steps:
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
+  # FIXIT: find out which code initializes CUDA before running the test
+  # until that is fixed, we need to use spawn to test it
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test
diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index e0641a54c419..d3e41fa71067 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -1,6 +1,9 @@
 import ctypes
 import json
 import os
+import pickle
+import subprocess
+import sys
 from itertools import product
 from typing import Dict, List, Optional, Sequence
 
@@ -198,7 +201,25 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
         ids = list(range(num_dev))
         # batch of all pairs of GPUs
         batch_src, batch_tgt = zip(*list(product(ids, ids)))
-        result = can_actually_p2p(batch_src, batch_tgt)
+        # NOTE: we use `subprocess` rather than `multiprocessing` here
+        # because the caller might not have `if __name__ == "__main__":`,
+        # in which case we cannot use the spawn method of multiprocessing.
+        # However, `can_actually_p2p` requires the spawn method.
+        # The fix is to call the function via `subprocess`,
+        # since this file does have `if __name__ == "__main__":`.
+        input_bytes = pickle.dumps((batch_src, batch_tgt))
+        returned = subprocess.run([sys.executable, __file__],
+                                  input=input_bytes,
+                                  capture_output=True)
+        # check whether the subprocess succeeded
+        try:
+            returned.check_returncode()
+        except Exception as e:
+            # wrap the raised exception to provide more information
+            raise RuntimeError(
+                f"Error occurred while batch testing "
+                f"peer-to-peer access from {batch_src} to {batch_tgt}") from e
+        result = pickle.loads(returned.stdout)
         for _i, _j, r in zip(batch_src, batch_tgt, result):
             cache[f"{_i}->{_j}"] = r
         with open(path, "w") as f:
@@ -213,3 +234,8 @@ def gpu_p2p_access_check(src: int, tgt: int) -> bool:
 
 
 __all__ = ["gpu_p2p_access_check"]
+
+if __name__ == "__main__":
+    batch_src, batch_tgt = pickle.loads(sys.stdin.buffer.read())
+    result = can_actually_p2p(batch_src, batch_tgt)
+    sys.stdout.buffer.write(pickle.dumps(result))
diff --git a/vllm/envs.py b/vllm/envs.py
index f03b69f4b886..ae2fcd0826fb 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -29,7 +29,7 @@
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_XLA_CACHE_PATH: str = "~/.vllm/xla_cache/"
     VLLM_USE_RAY_COMPILED_DAG: bool = False
-    VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_WORKER_MULTIPROC_METHOD: str = "fork"
     VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
@@ -212,7 +212,7 @@
     # Use dedicated multiprocess context for workers.
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
-    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+    lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork"),
 
     # Timeout for fetching images when serving multimodal models
     # Default is 5 seconds
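
The custom_all_reduce_utils.py hunks above implement a small parent/child protocol: the parent pickles its arguments, re-invokes the same file with `subprocess`, and unpickles the result from the child's stdout, while the `if __name__ == "__main__":` block serves as the child-side entry point. Below is a minimal self-contained sketch of that round trip, for illustration only; it is not vLLM code, and `expensive_check` and `run_in_subprocess` are hypothetical stand-ins for `can_actually_p2p` and its caller.

# illustration of the subprocess + pickle round trip used in the patch
import pickle
import subprocess
import sys
from typing import List, Sequence


def expensive_check(xs: Sequence[int], ys: Sequence[int]) -> List[bool]:
    # placeholder for work that must run in a fresh interpreter,
    # e.g. anything that would initialize CUDA in the parent process
    return [x == y for x, y in zip(xs, ys)]


def run_in_subprocess(xs: Sequence[int], ys: Sequence[int]) -> List[bool]:
    # pickle the arguments and feed them to a child interpreter that
    # re-executes this very file, so the child's __main__ block runs
    input_bytes = pickle.dumps((xs, ys))
    returned = subprocess.run([sys.executable, __file__],
                              input=input_bytes,
                              capture_output=True)
    try:
        returned.check_returncode()
    except Exception as e:
        # surface the child's stderr so failures are debuggable
        raise RuntimeError(
            f"child process failed: {returned.stderr.decode()}") from e
    # the child writes the pickled result to stdout
    return pickle.loads(returned.stdout)


if __name__ == "__main__":
    # child-side entry point: read pickled args from stdin,
    # write the pickled result to stdout
    xs, ys = pickle.loads(sys.stdin.buffer.read())
    sys.stdout.buffer.write(pickle.dumps(expensive_check(xs, ys)))

The point of the indirection is that the importing application does not need its own `if __name__ == "__main__":` guard; the child always hits the guard in this file, which is exactly the constraint the NOTE comment in the patch describes.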
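
The vllm/envs.py hunks only flip the default start method from "spawn" to "fork"; the VLLM_WORKER_MULTIPROC_METHOD environment variable still overrides it, which is what the CI steps above rely on when they export spawn. As a rough sketch (again an illustration, not vLLM's actual worker-launch code), such a setting is typically consumed through multiprocessing.get_context:

# illustration only: selecting a multiprocessing start method from an env var
import multiprocessing
import os


def _worker(rank: int) -> None:
    # defined at module level so it is picklable under the spawn method
    print(f"worker {rank} started")


if __name__ == "__main__":
    # "fork" mirrors the new envs.py default; note that fork is unsafe once
    # CUDA has been initialized in the parent and is unavailable on Windows
    method = os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "fork")
    ctx = multiprocessing.get_context(method)
    procs = [ctx.Process(target=_worker, args=(i, )) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

Fork avoids re-importing the parent's modules in each worker and does not require a __main__ guard in the caller, which is why it is a reasonable default as long as CUDA has not been touched before the workers are created.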