[Custom Device] add run_check support for custom device #56318

Merged 6 commits on Aug 17, 2023
27 changes: 27 additions & 0 deletions python/paddle/distributed/spawn.py
@@ -110,6 +110,8 @@ def _get_default_nprocs():
return core.get_xpu_device_count()
elif 'cpu' in device:
return multiprocessing.cpu_count()
elif device in core.get_available_custom_device():
return core.get_custom_device_count(device.split(":")[0])
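The `device` string consumed here can carry an index suffix (for example `npu:0`), which is why it is split on `:` before querying the device count. A minimal sketch of that parsing, assuming a hypothetical plugin named `npu`:

```python
# Minimal sketch of the new branch's device-string handling; the plugin
# name "npu" is an assumption, not something fixed by this PR.
device = "npu:0"                    # device string with an index suffix
device_type = device.split(":")[0]  # drop the suffix -> "npu"
print(device_type)                  # nprocs would be core.get_custom_device_count(device_type)
```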
else:
raise RuntimeError(
"`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -126,6 +128,8 @@ def _get_default_backend():
return 'bkcl'
elif 'cpu' in device:
return 'gloo'
elif device in core.get_available_custom_device():
return 'xccl'
Contributor:

Same as above: change this to support every hardware type registered through custom device, rather than deciding by string matching.

Contributor Author:

done
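For reference, the default backend dispatch after this change is: `nccl` for GPU, `bkcl` for XPU, `gloo` for CPU, and `xccl` for any registered custom device. A self-contained sketch of that dispatch (illustrative only, not the actual `_get_default_backend` body):

```python
def pick_backend(device: str, custom_devices: list) -> str:
    # Mirrors the dispatch order above; custom_devices stands in for
    # core.get_available_custom_device().
    if 'gpu' in device:
        return 'nccl'
    if 'xpu' in device:
        return 'bkcl'
    if 'cpu' in device:
        return 'gloo'
    if device in custom_devices:
        return 'xccl'
    raise RuntimeError(f"unsupported device `{device}`")

print(pick_backend("npu:0", ["npu:0", "npu:1"]))  # prints: xccl
```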

else:
raise RuntimeError(
"`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -275,6 +279,29 @@ def _get_subprocess_env_list(nprocs, options):
assert (
_get_trainers_num() == 1
), "CPUONLY spawn doesn't support multi-trainer"
elif options['backend'] == 'xccl':
args.selected_devices = None
custom_device_name = core.get_all_custom_device_type()[0]
env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
if env_devices is None or env_devices == "":
env_devices_list = [
str(x)
for x in range(core.get_custom_device_count(custom_device_name))
]
else:
env_devices_list = env_devices.split(',')

if len(env_devices_list) < nprocs:
raise RuntimeError(
"the number of visible devices(%d) is less than the number "
"of spawn processes(%d), please ensure that the correct "
"`nprocs` argument is passed or the environment variable "
"`FLAGS_selected_%ss` is correctly configured."
% (len(env_devices_list), nprocs, custom_device_name)
)
args.selected_devices = ",".join(
[str(env_devices_list[x]) for x in range(0, nprocs)]
)

# set other inner args
args.node_ip = options.get('node_ip', None)
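A hypothetical way to exercise this branch from user code: export the visibility flag, then spawn with the `xccl` backend. The plugin name `npu` and the `train` function are assumptions for illustration:

```python
import os

# Make only two devices of a hypothetical "npu" plugin visible;
# _get_subprocess_env_list reads FLAGS_selected_npus to fill selected_devices.
os.environ["FLAGS_selected_npus"] = "0,1"

import paddle.distributed as dist

# Requires the plugin to be installed, hence left commented out:
# dist.spawn(train, nprocs=2, backend='xccl')  # train() is user-defined
```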
12 changes: 12 additions & 0 deletions python/paddle/distributed/utils/launch_utils.py
@@ -437,6 +437,18 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
"PADDLE_DISTRI_BACKEND": backend, # only add here, other will be auto
}
elif backend == 'xccl':
from paddle.framework import core

custom_device_name = core.get_all_custom_device_type()[0]
proc_env = {
f"FLAGS_selected_{custom_device_name}s": "%s"
% ",".join([str(g) for g in trainer.gpus]),
"PADDLE_TRAINER_ID": "%d" % trainer.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
}
else:
raise ValueError("backend must be one of 'gloo', 'nccl', 'bkcl', 'xccl'")
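For a concrete picture, the `xccl` branch above would hand a trainer an environment shaped like the following; the device type `npu`, the ranks, and the endpoints are made-up values:

```python
# Shape of proc_env for a hypothetical rank-0 trainer owning devices [0, 1]
# of an "npu" plugin on a single node (all values illustrative):
proc_env = {
    "FLAGS_selected_npus": "0,1",
    "PADDLE_TRAINER_ID": "0",
    "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
    "PADDLE_TRAINERS_NUM": "1",
    "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6170",
}
```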

36 changes: 32 additions & 4 deletions python/paddle/utils/install_check.py
@@ -81,7 +81,7 @@ def _is_xpu_available():
return False


def _run_dygraph_single(use_cuda, use_xpu):
def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
"""
Testing the simple network in dygraph mode using one CPU/GPU/XPU.

@@ -94,6 +94,8 @@ def _run_dygraph_single(use_cuda, use_xpu):
paddle.set_device('gpu')
elif use_xpu:
paddle.set_device('xpu')
elif use_custom:
paddle.set_device(custom_device_name)
else:
paddle.set_device('cpu')
weight_attr = paddle.ParamAttr(
@@ -116,7 +118,7 @@ def _run_dygraph_single(use_cuda, use_xpu):
opt.step()


def _run_static_single(use_cuda, use_xpu):
def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
"""
Testing the simple network with executor running directly, using one CPU/GPU/XPU.

@@ -139,6 +141,8 @@ def _run_static_single(use_cuda, use_xpu):
place = paddle.CUDAPlace(0)
elif use_xpu:
place = paddle.XPUPlace(0)
elif use_custom:
place = paddle.CustomPlace(custom_device_name, 0)
else:
place = paddle.CPUPlace()
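Outside of the install check, the same place selection can be done by hand. A hedged sketch, assuming a plugin registered as `npu` (the calls are commented out because they need that plugin installed):

```python
import paddle

paddle.enable_static()
# place = paddle.CustomPlace("npu", 0)   # (device type, device id)
# exe = paddle.static.Executor(place)
# Dynamic-graph equivalent: paddle.set_device("npu:0")
```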

@@ -229,29 +233,53 @@ def run_check():

use_cuda = False
use_xpu = False
use_custom = False
custom_device_name = None

if paddle.is_compiled_with_cuda():
use_cuda = _is_cuda_available()
elif paddle.is_compiled_with_xpu():
use_xpu = _is_xpu_available()
elif len(paddle.framework.core.get_all_custom_device_type()) > 0:
use_custom = True
if len(paddle.framework.core.get_all_custom_device_type()) > 1:
logging.warning(
"More than one kind of custom device is detected, but run_check will only be executed on {}.".format(
paddle.framework.core.get_all_custom_device_type()[0]
)
)

if use_cuda:
device_str = "GPU"
device_list = paddle.static.cuda_places()
elif use_xpu:
device_str = "XPU"
device_list = paddle.static.xpu_places()
elif use_custom:
device_str = paddle.framework.core.get_all_custom_device_type()[0]
custom_device_name = device_str
device_list = list(
range(
paddle.framework.core.get_custom_device_count(
custom_device_name
)
)
)
Contributor:

This defaults to running only on device[0]. Check whether more than one device is registered and, if so, add a warning message noting that the check only covers device[0].

Contributor Author:

done

else:
device_str = "CPU"
device_list = paddle.static.cpu_places(device_count=1)
device_count = len(device_list)
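Note that, unlike `paddle.static.cuda_places()`, the custom-device branch builds `device_list` from plain integer ids. With four devices of an assumed `npu` plugin registered, it would come out as:

```python
# Assuming core.get_custom_device_count("npu") returns 4:
device_list = list(range(4))     # [0, 1, 2, 3]
device_count = len(device_list)  # 4
```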

_run_static_single(use_cuda, use_xpu)
_run_dygraph_single(use_cuda, use_xpu)
_run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
_run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
print(f"PaddlePaddle works well on 1 {device_str}.")

try:
if len(device_list) > 1:
if use_custom:
import os

os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
Contributor Author:

The distributed module sets the backend by reading the environment variable PADDLE_DISTRI_BACKEND, whose default value is auto. Once a custom device is detected here, the backend is manually set to xccl so that a wrong backend is not selected.

_run_parallel(device_list)
print(
"PaddlePaddle works well on {} {}s.".format(
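Taken together, a hypothetical end-to-end check on a machine with a custom device plugin installed would look like the sketch below; the expected output wording is approximated from the prints above:

```python
import paddle

# Needs a custom device plugin installed, hence commented out:
# paddle.utils.run_check()
#
# Expected single-device output, per the print above:
#   PaddlePaddle works well on 1 <device_type>.
# With more than one device visible, run_check also sets
# PADDLE_DISTRI_BACKEND=xccl and runs the parallel check.
```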