
Commit

[cuda][misc] remove error_on_invalid_device_count_status (vllm-projec…
youkaichao authored and kylesayrs committed Aug 17, 2024
1 parent 95119ad commit fbcb286
Showing 3 changed files with 3 additions and 32 deletions.
3 changes: 0 additions & 3 deletions vllm/executor/multiproc_gpu_executor.py
@@ -17,7 +17,6 @@
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.triton_utils import maybe_set_triton_cache_manager
 from vllm.utils import (_run_task_with_lock, cuda_device_count_stateless,
-                        error_on_invalid_device_count_status,
                         get_distributed_init_method, get_open_port,
                         get_vllm_instance_id, make_async,
                         update_environment_variables)
@@ -79,8 +78,6 @@ def _init_executor(self) -> None:
             f"please ensure that world_size ({world_size}) "
             f"is less than than max local gpu count ({cuda_device_count})")
 
-        error_on_invalid_device_count_status()
-
         # Multiprocessing-based executor does not support multi-node setting.
        # Since it only works for single node, we can use the loopback address
         # 127.0.0.1 for communication.
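The context lines above show the check the multiprocessing executor keeps: the requested world_size must not exceed the locally visible GPU count returned by cuda_device_count_stateless, which is still imported from vllm.utils in this diff. Below is a minimal sketch of that validation; the function name is hypothetical and the wording is illustrative rather than the exact executor code.

    # Illustrative sketch only: validate world_size against the locally
    # visible GPU count using the stateless helper the executor still imports.
    from vllm.utils import cuda_device_count_stateless


    def ensure_enough_local_gpus(world_size: int) -> None:
        cuda_device_count = cuda_device_count_stateless()
        if world_size > cuda_device_count:
            raise RuntimeError(
                f"please ensure that world_size ({world_size}) is less than "
                f"max local gpu count ({cuda_device_count})")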
9 changes: 3 additions & 6 deletions vllm/executor/ray_gpu_executor.py
@@ -10,10 +10,9 @@
 from vllm.executor.ray_utils import RayWorkerWrapper, ray
 from vllm.logger import init_logger
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
-from vllm.utils import (_run_task_with_lock,
-                        error_on_invalid_device_count_status,
-                        get_distributed_init_method, get_ip, get_open_port,
-                        get_vllm_instance_id, make_async)
+from vllm.utils import (_run_task_with_lock, get_distributed_init_method,
+                        get_ip, get_open_port, get_vllm_instance_id,
+                        make_async)
 
 if ray is not None:
     from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
@@ -216,8 +215,6 @@ def _init_workers_ray(self, placement_group: "PlacementGroup",
         distributed_init_method = get_distributed_init_method(
             driver_ip, get_open_port())
 
-        error_on_invalid_device_count_status()
-
         # Initialize the actual workers inside worker wrapper.
         init_worker_all_kwargs = [
             self._get_worker_kwargs(
23 changes: 0 additions & 23 deletions vllm/utils.py
@@ -1,6 +1,5 @@
 import argparse
 import asyncio
-import contextlib
 import datetime
 import enum
 import gc
@@ -923,28 +922,6 @@ def cuda_device_count_stateless() -> int:
     return _cuda_device_count_stateless(envs.CUDA_VISIBLE_DEVICES)
 
 
-def error_on_invalid_device_count_status():
-    cache_entries = 0
-    with contextlib.suppress(Exception):
-        # future pytorch will fix the issue, device_count will not be cached
-        # at that time, `.cache_info().currsize` will error out
-        cache_entries = torch.cuda.device_count.cache_info(  # type: ignore
-        ).currsize
-    if cache_entries != 0:
-        # the function is already called, and the result is cached
-        remembered = torch.cuda.device_count()
-        current = cuda_device_count_stateless()
-        if remembered > current:
-            raise RuntimeError(
-                "The number of CUDA devices has changed since the first "
-                "call to torch.cuda.device_count(). This is not allowed "
-                "and may result in undefined behavior. Please check out "
-                "https://github.com/vllm-project/vllm/issues/6056 to "
-                "find the first call to torch.cuda.device_count() "
-                "and defer it until the engine is up. Or you can set "
-                "CUDA_VISIBLE_DEVICES to the GPUs you want to use.")
-
-
 # NVML utils
 # Note that NVML is not affected by `CUDA_VISIBLE_DEVICES`,
 # all the related functions work on real physical device ids.
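For reference, the removed helper compared torch.cuda.device_count(), whose result can be cached by some PyTorch versions, against the cuda_device_count_stateless() helper that remains in vllm/utils.py. Below is a minimal sketch of that stateless idea: key a cached count on the current CUDA_VISIBLE_DEVICES value so that changing the variable yields a fresh count. The names and the fallback behavior are simplified assumptions, not the actual vLLM implementation.

    # Illustrative sketch only: a device count keyed on CUDA_VISIBLE_DEVICES,
    # so a changed environment variable is never masked by an earlier cached value.
    import os
    from functools import lru_cache
    from typing import Optional

    import torch


    @lru_cache
    def _count_for(cuda_visible_devices: Optional[str]) -> int:
        # Simplification: when CUDA_VISIBLE_DEVICES is set, count its entries;
        # otherwise fall back to torch (which may cache its own result).
        if cuda_visible_devices:
            return len([d for d in cuda_visible_devices.split(",") if d])
        return torch.cuda.device_count()


    def device_count_stateless() -> int:
        # Re-reading the environment variable on every call is what keeps the
        # result consistent with the current CUDA_VISIBLE_DEVICES setting.
        return _count_for(os.environ.get("CUDA_VISIBLE_DEVICES"))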
