From ceb8a0dec0934ccc092b68b3bf7873ed743c9357 Mon Sep 17 00:00:00 2001 From: Kai-Hsun Chen Date: Mon, 3 Mar 2025 00:45:14 +0000 Subject: [PATCH 1/3] update Signed-off-by: Kai-Hsun Chen --- python/ray/_private/accelerators/amd_gpu.py | 7 ++++--- python/ray/_private/accelerators/hpu.py | 3 --- python/ray/_private/accelerators/intel_gpu.py | 6 ++++-- python/ray/_private/accelerators/neuron.py | 9 ++++----- python/ray/_private/accelerators/npu.py | 9 ++++----- python/ray/_private/accelerators/nvidia_gpu.py | 7 ++++--- python/ray/_private/accelerators/tpu.py | 8 ++++---- python/ray/_private/ray_constants.py | 15 +++++++++++++-- python/ray/air/_internal/device_manager/npu.py | 2 +- python/ray/train/_internal/backend_executor.py | 2 +- 10 files changed, 39 insertions(+), 29 deletions(-) diff --git a/python/ray/_private/accelerators/amd_gpu.py b/python/ray/_private/accelerators/amd_gpu.py index dcfe8f9b55775..ab37c1dee2888 100644 --- a/python/ray/_private/accelerators/amd_gpu.py +++ b/python/ray/_private/accelerators/amd_gpu.py @@ -3,12 +3,13 @@ from typing import Optional, List, Tuple from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.ray_constants import ( + ROCR_VISIBLE_DEVICES_ENV_VAR, + NOSET_ROCR_VISIBLE_DEVICES_ENV_VAR, +) logger = logging.getLogger(__name__) -ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES" -NOSET_ROCR_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES" - amd_product_dict = { "0x738c": "AMD-Instinct-MI100", "0x7408": "AMD-Instinct-MI250X", diff --git a/python/ray/_private/accelerators/hpu.py b/python/ray/_private/accelerators/hpu.py index 87bae0a9267ee..c77f96a714cb3 100644 --- a/python/ray/_private/accelerators/hpu.py +++ b/python/ray/_private/accelerators/hpu.py @@ -8,9 +8,6 @@ logger = logging.getLogger(__name__) -HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES" -NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES" - @lru_cache() def is_package_present(package_name: str) -> bool: diff --git a/python/ray/_private/accelerators/intel_gpu.py b/python/ray/_private/accelerators/intel_gpu.py index bd6f1c0fcbb14..28e0ebb5927eb 100644 --- a/python/ray/_private/accelerators/intel_gpu.py +++ b/python/ray/_private/accelerators/intel_gpu.py @@ -3,11 +3,13 @@ from typing import Optional, List, Tuple from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.ray_constants import ( + ONEAPI_DEVICE_SELECTOR_ENV_VAR, + NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR, +) logger = logging.getLogger(__name__) -ONEAPI_DEVICE_SELECTOR_ENV_VAR = "ONEAPI_DEVICE_SELECTOR" -NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR" ONEAPI_DEVICE_BACKEND_TYPE = "level_zero" ONEAPI_DEVICE_TYPE = "gpu" diff --git a/python/ray/_private/accelerators/neuron.py b/python/ray/_private/accelerators/neuron.py index 7ba9eeb0666b0..141bd0ddd561b 100644 --- a/python/ray/_private/accelerators/neuron.py +++ b/python/ray/_private/accelerators/neuron.py @@ -6,14 +6,13 @@ from typing import Optional, List, Tuple from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.ray_constants import ( + NEURON_RT_VISIBLE_CORES_ENV_VAR, + NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR, +) logger = logging.getLogger(__name__) -NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES" -NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = ( - "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES" -) - # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch # https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch # Subject to removal after the information is available via public API diff --git a/python/ray/_private/accelerators/npu.py b/python/ray/_private/accelerators/npu.py index d98434cd302ae..46eface6eac6b 100644 --- a/python/ray/_private/accelerators/npu.py +++ b/python/ray/_private/accelerators/npu.py @@ -4,14 +4,13 @@ from typing import Optional, List, Tuple from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.ray_constants import ( + ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, + NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, +) logger = logging.getLogger(__name__) -ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES" -NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = ( - "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES" -) - class NPUAcceleratorManager(AcceleratorManager): """Ascend NPU accelerators.""" diff --git a/python/ray/_private/accelerators/nvidia_gpu.py b/python/ray/_private/accelerators/nvidia_gpu.py index 2eaafb5a6e06e..e6137e13f8d74 100644 --- a/python/ray/_private/accelerators/nvidia_gpu.py +++ b/python/ray/_private/accelerators/nvidia_gpu.py @@ -4,12 +4,13 @@ from typing import Optional, List, Tuple from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.ray_constants import ( + CUDA_VISIBLE_DEVICES_ENV_VAR, + NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR, +) logger = logging.getLogger(__name__) -CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES" -NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" - # TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have # the form "Tesla V100-SXM2-16GB" or "Tesla K80"). NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)") diff --git a/python/ray/_private/accelerators/tpu.py b/python/ray/_private/accelerators/tpu.py index 1349606e8ad3f..8f4450b12f9ce 100644 --- a/python/ray/_private/accelerators/tpu.py +++ b/python/ray/_private/accelerators/tpu.py @@ -7,6 +7,10 @@ from typing import Dict, Optional, List, Tuple from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.ray_constants import ( + TPU_VISIBLE_CHIPS_ENV_VAR, + NOSET_TPU_VISIBLE_CHIPS_ENV_VAR, +) logger = logging.getLogger(__name__) @@ -28,10 +32,6 @@ GCE_TPU_INSTANCE_ID_KEY = "instance-id" GCE_TPU_WORKER_ID_KEY = "agent-worker-number" -TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS" - -NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS" - # The following defines environment variables that allow # us to access a subset of TPU visible chips. # diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 859d784079dc3..38451e9b32334 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -428,13 +428,24 @@ def env_set_by_user(key): LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"] # Accelerator constants +NOSET_ASCEND_RT_VISIBLE_DEVICES = "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES" +NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = ( + "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES" +) NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" +NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES" +NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR" +NOSET_ROCR_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES" +NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS" +ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES" CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES" -ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES" +HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES" NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES" +ONEAPI_DEVICE_SELECTOR_ENV_VAR = "ONEAPI_DEVICE_SELECTOR" +ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES" TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS" -NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES" + NEURON_CORES = "neuron_cores" GPU = "GPU" diff --git a/python/ray/air/_internal/device_manager/npu.py b/python/ray/air/_internal/device_manager/npu.py index aa6d7bad24081..bd48c6cc793c7 100644 --- a/python/ray/air/_internal/device_manager/npu.py +++ b/python/ray/air/_internal/device_manager/npu.py @@ -55,7 +55,7 @@ def get_devices(self) -> List[torch.device]: if len(npu_ids) > 0: npu_visible_str = os.environ.get( - ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, "" + ray_constants.ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, "" ) if npu_visible_str and npu_visible_str != "NoDevFiles": npu_visible_list = npu_visible_str.split(",") diff --git a/python/ray/train/_internal/backend_executor.py b/python/ray/train/_internal/backend_executor.py index 3815f31add409..60cd8a1a55b98 100644 --- a/python/ray/train/_internal/backend_executor.py +++ b/python/ray/train/_internal/backend_executor.py @@ -123,7 +123,7 @@ def __init__( ResourceConfig( ray_constants.NPU, ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV, - ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, + ray_constants.ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, ), # For AMD GPUs, they are using ROCR_VISIBLE_DEVICES env var. ResourceConfig( From c31837f1aab1739fd488ad87b68998bc4ede33df Mon Sep 17 00:00:00 2001 From: Kai-Hsun Chen Date: Mon, 3 Mar 2025 00:47:45 +0000 Subject: [PATCH 2/3] update Signed-off-by: Kai-Hsun Chen --- python/ray/_private/accelerators/hpu.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/ray/_private/accelerators/hpu.py b/python/ray/_private/accelerators/hpu.py index c77f96a714cb3..f5ec5866d08b4 100644 --- a/python/ray/_private/accelerators/hpu.py +++ b/python/ray/_private/accelerators/hpu.py @@ -5,6 +5,10 @@ from importlib.util import find_spec from ray._private.accelerators.accelerator import AcceleratorManager +from ray._private.ray_constants import ( + HABANA_VISIBLE_DEVICES_ENV_VAR, + NOSET_HABANA_VISIBLE_MODULES_ENV_VAR, +) logger = logging.getLogger(__name__) From 00f51a2ae5c55ef74340cbb480c14e3ab74b56b5 Mon Sep 17 00:00:00 2001 From: Kai-Hsun Chen Date: Mon, 3 Mar 2025 00:50:19 +0000 Subject: [PATCH 3/3] update Signed-off-by: Kai-Hsun Chen --- python/ray/_private/ray_constants.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/ray/_private/ray_constants.py b/python/ray/_private/ray_constants.py index 38451e9b32334..f76549915d289 100644 --- a/python/ray/_private/ray_constants.py +++ b/python/ray/_private/ray_constants.py @@ -428,7 +428,9 @@ def env_set_by_user(key): LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"] # Accelerator constants -NOSET_ASCEND_RT_VISIBLE_DEVICES = "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES" +NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = ( + "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES" +) NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = ( "RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES" )