Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[core][refactor] Move accelerator-specific environment variables to ray_constants.py to avoid redefining them #51026

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions python/ray/_private/accelerators/amd_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.ray_constants import (
ROCR_VISIBLE_DEVICES_ENV_VAR,
NOSET_ROCR_VISIBLE_DEVICES_ENV_VAR,
)

logger = logging.getLogger(__name__)

ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES"
NOSET_ROCR_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES"

amd_product_dict = {
"0x738c": "AMD-Instinct-MI100",
"0x7408": "AMD-Instinct-MI250X",
Expand Down
7 changes: 4 additions & 3 deletions python/ray/_private/accelerators/hpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,13 @@
from importlib.util import find_spec

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.ray_constants import (
HABANA_VISIBLE_DEVICES_ENV_VAR,
NOSET_HABANA_VISIBLE_MODULES_ENV_VAR,
)

logger = logging.getLogger(__name__)

HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES"
NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"


@lru_cache()
def is_package_present(package_name: str) -> bool:
Expand Down
6 changes: 4 additions & 2 deletions python/ray/_private/accelerators/intel_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.ray_constants import (
ONEAPI_DEVICE_SELECTOR_ENV_VAR,
NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR,
)

logger = logging.getLogger(__name__)

ONEAPI_DEVICE_SELECTOR_ENV_VAR = "ONEAPI_DEVICE_SELECTOR"
NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR"
ONEAPI_DEVICE_BACKEND_TYPE = "level_zero"
ONEAPI_DEVICE_TYPE = "gpu"

Expand Down
9 changes: 4 additions & 5 deletions python/ray/_private/accelerators/neuron.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,13 @@
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.ray_constants import (
NEURON_RT_VISIBLE_CORES_ENV_VAR,
NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR,
)

logger = logging.getLogger(__name__)

NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = (
"RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES"
)

# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/inf2-arch.html#aws-inf2-arch
# https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-hardware/trn1-arch.html#aws-trn1-arch
# Subject to removal after the information is available via public API
Expand Down
9 changes: 4 additions & 5 deletions python/ray/_private/accelerators/npu.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,13 @@
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.ray_constants import (
ASCEND_RT_VISIBLE_DEVICES_ENV_VAR,
NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR,
)

logger = logging.getLogger(__name__)

ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"
NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = (
"RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"
)


class NPUAcceleratorManager(AcceleratorManager):
"""Ascend NPU accelerators."""
Expand Down
7 changes: 4 additions & 3 deletions python/ray/_private/accelerators/nvidia_gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
from typing import Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.ray_constants import (
CUDA_VISIBLE_DEVICES_ENV_VAR,
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR,
)

logger = logging.getLogger(__name__)

CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"

# TODO(Alex): This pattern may not work for non NVIDIA Tesla GPUs (which have
# the form "Tesla V100-SXM2-16GB" or "Tesla K80").
NVIDIA_GPU_NAME_PATTERN = re.compile(r"\w+\s+([A-Z0-9]+)")
Expand Down
8 changes: 4 additions & 4 deletions python/ray/_private/accelerators/tpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
from typing import Dict, Optional, List, Tuple

from ray._private.accelerators.accelerator import AcceleratorManager
from ray._private.ray_constants import (
TPU_VISIBLE_CHIPS_ENV_VAR,
NOSET_TPU_VISIBLE_CHIPS_ENV_VAR,
)

logger = logging.getLogger(__name__)

Expand All @@ -28,10 +32,6 @@
GCE_TPU_INSTANCE_ID_KEY = "instance-id"
GCE_TPU_WORKER_ID_KEY = "agent-worker-number"

TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"

NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS"

# The following defines environment variables that allow
# us to access a subset of TPU visible chips.
#
Expand Down
17 changes: 15 additions & 2 deletions python/ray/_private/ray_constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -428,13 +428,26 @@ def env_set_by_user(key):
LANGUAGE_WORKER_TYPES = ["python", "java", "cpp"]

# Accelerator constants
NOSET_ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = (
"RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES"
)
NOSET_AWS_NEURON_RT_VISIBLE_CORES_ENV_VAR = (
"RAY_EXPERIMENTAL_NOSET_NEURON_RT_VISIBLE_CORES"
)
NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES"
NOSET_HABANA_VISIBLE_MODULES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_HABANA_VISIBLE_MODULES"
NOSET_ONEAPI_DEVICE_SELECTOR_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ONEAPI_DEVICE_SELECTOR"
NOSET_ROCR_VISIBLE_DEVICES_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES"
NOSET_TPU_VISIBLE_CHIPS_ENV_VAR = "RAY_EXPERIMENTAL_NOSET_TPU_VISIBLE_CHIPS"

ASCEND_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"
CUDA_VISIBLE_DEVICES_ENV_VAR = "CUDA_VISIBLE_DEVICES"
ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES"
HABANA_VISIBLE_DEVICES_ENV_VAR = "HABANA_VISIBLE_MODULES"
NEURON_RT_VISIBLE_CORES_ENV_VAR = "NEURON_RT_VISIBLE_CORES"
ONEAPI_DEVICE_SELECTOR_ENV_VAR = "ONEAPI_DEVICE_SELECTOR"
ROCR_VISIBLE_DEVICES_ENV_VAR = "ROCR_VISIBLE_DEVICES"
TPU_VISIBLE_CHIPS_ENV_VAR = "TPU_VISIBLE_CHIPS"
NPU_RT_VISIBLE_DEVICES_ENV_VAR = "ASCEND_RT_VISIBLE_DEVICES"
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This constant has the same value as ASCEND_RT_VISIBLE_DEVICES_ENV_VAR (both are "ASCEND_RT_VISIBLE_DEVICES"), so it is redundant.



NEURON_CORES = "neuron_cores"
GPU = "GPU"
Expand Down
2 changes: 1 addition & 1 deletion python/ray/air/_internal/device_manager/npu.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def get_devices(self) -> List[torch.device]:

if len(npu_ids) > 0:
npu_visible_str = os.environ.get(
ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR, ""
ray_constants.ASCEND_RT_VISIBLE_DEVICES_ENV_VAR, ""
)
if npu_visible_str and npu_visible_str != "NoDevFiles":
npu_visible_list = npu_visible_str.split(",")
Expand Down
2 changes: 1 addition & 1 deletion python/ray/train/_internal/backend_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ def __init__(
ResourceConfig(
ray_constants.NPU,
ENABLE_SHARE_NPU_RT_VISIBLE_DEVICES_ENV,
ray_constants.NPU_RT_VISIBLE_DEVICES_ENV_VAR,
ray_constants.ASCEND_RT_VISIBLE_DEVICES_ENV_VAR,
),
# For AMD GPUs, they are using ROCR_VISIBLE_DEVICES env var.
ResourceConfig(
Expand Down