[RLlib] Add NPU and HPU support to RLlib #49535

Open · wants to merge 6 commits into master
11 changes: 11 additions & 0 deletions rllib/algorithms/algorithm_config.py
@@ -357,6 +357,7 @@ def __init__(self, algo_class: Optional[type] = None):
self.num_learners = 0
self.num_gpus_per_learner = 0
self.num_cpus_per_learner = "auto"
self.custom_resources_per_learner = {}
self.num_aggregator_actors_per_learner = 0
self.max_requests_in_flight_per_aggregator_actor = 3
self.local_gpu_idx = 0
@@ -2138,6 +2139,9 @@ def learners(
num_learners: Optional[int] = NotProvided,
num_cpus_per_learner: Optional[Union[str, float, int]] = NotProvided,
num_gpus_per_learner: Optional[Union[float, int]] = NotProvided,
custom_resources_per_learner: Optional[
Dict[str, Union[float, int]]
] = NotProvided,
num_aggregator_actors_per_learner: Optional[int] = NotProvided,
max_requests_in_flight_per_aggregator_actor: Optional[float] = NotProvided,
local_gpu_idx: Optional[int] = NotProvided,
@@ -2164,6 +2168,11 @@ def learners(
`num_learners=0`, any value greater than 0 runs the
training on a single GPU on the main process, while a value of 0 runs
the training on main process CPUs.
custom_resources_per_learner: A dict specifying custom resources to allocate
per Learner worker. Analogous to the GPU setting: if you request a positive
amount of an accelerator such as an NPU or HPU (already supported by Ray
Train), for example {"NPU": 1}, training runs on that accelerator.
num_aggregator_actors_per_learner: The number of aggregator actors per
Learner (if num_learners=0, one local learner is created). Must be at
least 1. Aggregator actors perform the task of a) converting episodes
@@ -2196,6 +2205,8 @@ def learners(
self.num_cpus_per_learner = num_cpus_per_learner
if num_gpus_per_learner is not NotProvided:
self.num_gpus_per_learner = num_gpus_per_learner
if custom_resources_per_learner is not NotProvided:
self.custom_resources_per_learner = custom_resources_per_learner
if num_aggregator_actors_per_learner is not NotProvided:
self.num_aggregator_actors_per_learner = num_aggregator_actors_per_learner
if max_requests_in_flight_per_aggregator_actor is not NotProvided:
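For context, a minimal usage sketch of the new option (the PPO setup, environment name, and resource amounts below are illustrative assumptions; only `custom_resources_per_learner` itself comes from this PR):

```python
from ray.rllib.algorithms.ppo import PPOConfig

# Illustrative setup only: request one NPU per Learner instead of a GPU.
# Requires a Ray cluster that actually exposes an "NPU" custom resource and
# a torch build with the matching accelerator backend installed.
config = (
    PPOConfig()
    .environment("CartPole-v1")
    .learners(
        num_learners=1,
        num_gpus_per_learner=0,
        custom_resources_per_learner={"NPU": 1},
    )
)
# algo = config.build()  # each Learner actor would then be scheduled with {"NPU": 1}
```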
2 changes: 2 additions & 0 deletions rllib/core/learner/learner_group.py
@@ -145,9 +145,11 @@ def __init__(
# TODO (sven): Activate this when Ray has figured out GPU pre-loading.
# - (0.01 * self.config.num_aggregator_actors_per_learner),
)
custom_resources_per_learner = self.config.custom_resources_per_learner
resources_per_learner = {
"CPU": num_cpus_per_learner,
"GPU": num_gpus_per_learner,
**custom_resources_per_learner,
}

backend_executor = BackendExecutor(
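To make the dict-unpacking above concrete, a tiny standalone sketch with hypothetical values:

```python
# Hypothetical per-Learner request: 1 CPU, no GPU, one NPU.
num_cpus_per_learner = 1
num_gpus_per_learner = 0
custom_resources_per_learner = {"NPU": 1}

# Mirrors the merge in LearnerGroup.__init__ above.
resources_per_learner = {
    "CPU": num_cpus_per_learner,
    "GPU": num_gpus_per_learner,
    **custom_resources_per_learner,
}
assert resources_per_learner == {"CPU": 1, "GPU": 0, "NPU": 1}
# This dict is then passed to the BackendExecutor that schedules each Learner actor.
```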
6 changes: 5 additions & 1 deletion rllib/core/learner/torch/torch_learner.py
@@ -462,7 +462,11 @@ def build(self) -> None:
after setting up all variables because `configure_optimizer_for_module` is
called in this `Learner.build()`.
"""
self._device = get_device(self.config, self.config.num_gpus_per_learner)
self._device = get_device(
self.config,
self.config.num_gpus_per_learner,
self.config.custom_resources_per_learner,
)

super().build()

1 change: 1 addition & 0 deletions rllib/env/multi_agent_env_runner.py
@@ -99,6 +99,7 @@ def __init__(self, config: AlgorithmConfig, **kwargs):
self._device = get_device(
self.config,
0 if not self.worker_index else self.config.num_gpus_per_env_runner,
self.config.custom_resources_per_env_runner,
)

# Create the vectorized gymnasium env.
1 change: 1 addition & 0 deletions rllib/env/single_agent_env_runner.py
@@ -92,6 +92,7 @@ def __init__(self, *, config: AlgorithmConfig, **kwargs):
self._device = get_device(
self.config,
0 if not self.worker_index else self.config.num_gpus_per_env_runner,
self.config.custom_resources_per_env_runner,
)

# Create the vectorized gymnasium env.
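The EnvRunner changes pick up the existing `custom_resources_per_env_runner` setting; a hedged sketch of how it could be combined with this PR (env name and amounts are assumptions, and it presumes `env_runners()` accepts this argument):

```python
from ray.rllib.algorithms.ppo import PPOConfig

# Illustrative only: give each EnvRunner one HPU so its `get_device()` call
# resolves to that accelerator (assumes the cluster exposes an "HPU" resource).
config = (
    PPOConfig()
    .environment("CartPole-v1")
    .env_runners(
        num_env_runners=2,
        custom_resources_per_env_runner={"HPU": 1},
    )
)
```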
14 changes: 13 additions & 1 deletion rllib/utils/framework.py
@@ -51,14 +51,20 @@ def convert_to_tensor(


@PublicAPI
def get_device(config: "AlgorithmConfig", num_gpus_requested: int = 1):
def get_device(
config: "AlgorithmConfig",
num_gpus_requested: int = 1,
custom_resources_requested: Optional[dict] = None,
):
"""Returns a single device (CPU or some GPU) depending on a config.

Args:
config: An AlgorithmConfig to extract information from about the device to use.
num_gpus_requested: The number of GPUs actually requested. This may be the value
of `config.num_gpus_per_env_runner` when for example calling this function
from an EnvRunner.
custom_resources_requested: Analogous to `num_gpus_requested`: a dict mapping
custom accelerator resources (for example NPU or HPU) to the amounts
actually requested.

Returns:
A single device (or name) given `config` and `num_gpus_requested`.
@@ -94,6 +100,12 @@ def get_device(config: "AlgorithmConfig", num_gpus_requested: int = 1):
# `torch.cuda.device_count() = 1` and torch.device(0) maps to that GPU
# with ID=1 on the node.
return torch.device(config.local_gpu_idx)
elif custom_resources_requested:
from ray.air._internal.torch_utils import get_devices

# The `get_devices()` API in ray.air should handle custom accelerators and
# return torch.device("cpu") if no accelerator is available.
return get_devices()[0]
else:
return torch.device("cpu")
else:
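For readers skimming the diff, a condensed standalone sketch of the resulting selection order in `get_device()` (simplified, not the exact implementation; the helper name is made up):

```python
import torch


def _pick_device(num_gpus_requested, custom_resources_requested=None, local_gpu_idx=0):
    """Simplified sketch of the device-selection order after this PR."""
    if num_gpus_requested > 0:
        # Ray narrows CUDA_VISIBLE_DEVICES for the worker, so the visible GPU
        # is addressed via the local index.
        return torch.device(local_gpu_idx)
    elif custom_resources_requested:
        # Defer to Ray AIR, which resolves NPU/HPU devices and falls back to
        # torch.device("cpu") if no accelerator is available.
        from ray.air._internal.torch_utils import get_devices

        return get_devices()[0]
    return torch.device("cpu")


# Example: no GPU requested, but one NPU -> Ray AIR picks the device.
# device = _pick_device(0, {"NPU": 1})
```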