From a39a86a60fc7ffc15f65a30d186871ca078e8e1e Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Wed, 25 Sep 2024 20:23:05 +0000
Subject: [PATCH 1/3] [Core] Get multiprocessing context at runtime

Instead of getting the multiprocessing context at import time, get it
at runtime. This allows other code in vllm to change the
VLLM_WORKER_MULTIPROC_METHOD env var and have it take effect here.

Signed-off-by: Russell Bryant
---
 vllm/executor/multiproc_worker_utils.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/vllm/executor/multiproc_worker_utils.py b/vllm/executor/multiproc_worker_utils.py
index 5bef76b90d332..e14ecc13a9dc0 100644
--- a/vllm/executor/multiproc_worker_utils.py
+++ b/vllm/executor/multiproc_worker_utils.py
@@ -27,9 +27,6 @@

 JOIN_TIMEOUT_S = 2

-mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
-mp = multiprocessing.get_context(mp_method)
-

 @dataclass
 class Result(Generic[T]):
@@ -77,7 +74,7 @@ class ResultHandler(threading.Thread):

     def __init__(self) -> None:
         super().__init__(daemon=True)
-        self.result_queue = mp.Queue()
+        self.result_queue = get_mp_context().Queue()
         self.tasks: Dict[uuid.UUID, Union[ResultFuture, asyncio.Future]] = {}

     def run(self):
@@ -147,10 +144,11 @@ class ProcessWorkerWrapper:

     def __init__(self, result_handler: ResultHandler,
                  worker_factory: Callable[[], Any]) -> None:
-        self._task_queue = mp.Queue()
+        self.mp = get_mp_context()
+        self._task_queue = self.mp.Queue()
         self.result_queue = result_handler.result_queue
         self.tasks = result_handler.tasks
-        self.process: BaseProcess = mp.Process(  # type: ignore[attr-defined]
+        self.process: BaseProcess = self.mp.Process(  # type: ignore[attr-defined]
             target=_run_worker_process,
             name="VllmWorkerProcess",
             kwargs=dict(
@@ -204,7 +202,7 @@ def _run_worker_process(
     """Worker process event loop"""

     # Add process-specific prefix to stdout and stderr
-    process_name = mp.current_process().name
+    process_name = get_mp_context().current_process().name
     pid = os.getpid()
     _add_prefix(sys.stdout, process_name, pid)
     _add_prefix(sys.stderr, process_name, pid)
@@ -269,3 +267,8 @@ def write_with_prefix(s: str):
             file.start_new_line = True  # type: ignore[attr-defined]

     file.write = write_with_prefix  # type: ignore[method-assign]
+
+
+def get_mp_context():
+    mp_method = envs.VLLM_WORKER_MULTIPROC_METHOD
+    return multiprocessing.get_context(mp_method)

From a522c71da005e47ce4fc4a6c304a73aaea11881a Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Tue, 24 Sep 2024 16:42:02 +0000
Subject: [PATCH 2/3] [Core] Always use `spawn` mp method from the CLI

When using the CLI entry point, always default to using the `spawn`
multiprocessing method. The safest multiprocessing method is `spawn`,
as the default `fork` method is not compatible with some accelerators.
The default method will be changing in future versions of Python, so we
should use it explicitly when possible.

We only set it here in the CLI entrypoint, because changing to `spawn`
could break some existing code using vLLM as a library, since some
configurations will work fine with the default of `fork`. `spawn` will
cause unexpected behavior if the code is not protected by
`if __name__ == "__main__":`.
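To make the constraint concrete, here is a minimal sketch of how a script
that uses vLLM as a library needs to be structured once `spawn` is in
effect (illustrative only, not part of this change; the model name is
just a placeholder):

```python
# Illustrative sketch, not part of this patch. With the `spawn` start
# method, worker processes re-import the launching script, so engine
# construction must be guarded or each worker would try to build its
# own engine again.
from vllm import LLM, SamplingParams


def run() -> None:
    llm = LLM(model="facebook/opt-125m")  # placeholder model name
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(max_tokens=16))
    for output in outputs:
        print(output.outputs[0].text)


if __name__ == "__main__":  # required once the start method is `spawn`
    run()
```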
Signed-off-by: Russell Bryant
---
 vllm/scripts.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/vllm/scripts.py b/vllm/scripts.py
index 231a18e99f3d7..7f2ba62695d3e 100644
--- a/vllm/scripts.py
+++ b/vllm/scripts.py
@@ -12,8 +12,11 @@
 from vllm.engine.arg_utils import EngineArgs
 from vllm.entrypoints.openai.api_server import run_server
 from vllm.entrypoints.openai.cli_args import make_arg_parser
+from vllm.logger import init_logger
 from vllm.utils import FlexibleArgumentParser

+logger = init_logger(__name__)
+

 def register_signal_handlers():
@@ -114,7 +117,30 @@ def _add_query_options(
     return parser


+def env_setup():
+    # The safest multiprocessing method is `spawn`, as the default `fork` method
+    # is not compatible with some accelerators. The default method will be
+    # changing in future versions of Python, so we should use it explicitly when
+    # possible.
+    #
+    # We only set it here in the CLI entrypoint, because changing to `spawn`
+    # could break some existing code using vLLM as a library. `spawn` will cause
+    # unexpected behavior if the code is not protected by
+    # `if __name__ == "__main__":`.
+    #
+    # References:
+    # - https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods
+    # - https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
+    # - https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
+    # - https://docs.habana.ai/en/latest/PyTorch/Getting_Started_with_PyTorch_and_Gaudi/Getting_Started_with_PyTorch.html?highlight=multiprocessing#torch-multiprocessing-for-dataloaders
+    if "VLLM_WORKER_MULTIPROC_METHOD" not in os.environ:
+        logger.debug("Setting VLLM_WORKER_MULTIPROC_METHOD to 'spawn'")
+        os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
+
 def main():
+    env_setup()
+
     parser = FlexibleArgumentParser(description="vLLM CLI")
     subparsers = parser.add_subparsers(required=True)

From 35105e726be3ca7066fa508df6698763de608eee Mon Sep 17 00:00:00 2001
From: Russell Bryant
Date: Tue, 24 Sep 2024 23:40:33 +0000
Subject: [PATCH 3/3] [Core] Use `spawn` when CUDA is already initialized

One condition that we know will be broken with the default
multiprocessing method of `fork` is a user of vLLM as a library
initializing CUDA prior to running vLLM. This change detects that case,
emits a warning to the log, and force-sets the method to `spawn`.

Similar code exists elsewhere (for AMD, Intel) to force the use of
`spawn` in all cases for those accelerators. We retain the default
behavior if the env var is not set and CUDA is not initialized. This
seems to work fine and avoids potentially breaking code that uses vLLM
as a library without protecting it under `if __name__ == "__main__":`.
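For illustration only (not part of this change), the scenario the new
check guards against looks roughly like the following; the model name
and `tensor_parallel_size` value are placeholders:

```python
# Illustrative sketch, not part of this patch. Touching CUDA in the
# parent process before constructing the engine is exactly the case
# that breaks `fork`: forked workers would inherit an already
# initialized CUDA context, which CUDA does not support.
import torch

from vllm import LLM


def main() -> None:
    torch.ones(1, device="cuda")        # initializes CUDA in the parent
    assert torch.cuda.is_initialized()

    # Multi-GPU engine construction starts worker processes; the executor
    # now detects the initialized CUDA context and forces
    # VLLM_WORKER_MULTIPROC_METHOD=spawn instead of failing under `fork`.
    llm = LLM(model="facebook/opt-125m",   # placeholder model name
              tensor_parallel_size=2)      # placeholder parallel size
    print(llm.generate(["Hello"])[0].outputs[0].text)


if __name__ == "__main__":  # still required because `spawn` is used
    main()
```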
Signed-off-by: Russell Bryant
---
 vllm/executor/multiproc_gpu_executor.py | 11 +++++++++--
 vllm/utils.py                           |  7 +++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index cc535e99a06ef..2dbde778e49b1 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -15,8 +15,8 @@
 from vllm.sequence import ExecuteModelRequest
 from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        get_distributed_init_method, get_open_port,
-                        get_vllm_instance_id, make_async,
+                        cuda_is_initialized, get_distributed_init_method,
+                        get_open_port, get_vllm_instance_id, make_async,
                         update_environment_variables)

 logger = init_logger(__name__)
@@ -122,6 +122,13 @@ def _check_executor_parameters(self):
             "CUDA_VISIBLE_DEVICES": (",".join(map(str, range(world_size))))
         })

+        if (cuda_is_initialized()
+                and os.environ.get("VLLM_WORKER_MULTIPROC_METHOD") != "spawn"):
+            logger.warning("CUDA was previously initialized. We must use "
+                           "the `spawn` multiprocessing start method. Setting "
+                           "VLLM_WORKER_MULTIPROC_METHOD to 'spawn'.")
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+
         cuda_device_count = cuda_device_count_stateless()
         # Use confusing message for more common TP-only case.
         assert tensor_parallel_size <= cuda_device_count, (
diff --git a/vllm/utils.py b/vllm/utils.py
index b73e3b9bbf68e..ed03b71a06cc5 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -1090,6 +1090,13 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)


+def cuda_is_initialized() -> bool:
+    """Check if CUDA is initialized."""
+    if not torch.cuda._is_compiled():
+        return False
+    return torch.cuda.is_initialized()
+
+
 def weak_bind(bound_method: Callable[..., Any], ) -> Callable[..., None]:
     """Make an instance method that weakly references
     its associated instance and no-ops once that