[Custom Device] add run_check support for custom device #56318

Merged
merged 6 commits on Aug 17, 2023
Changes from 3 commits
29 changes: 29 additions & 0 deletions python/paddle/distributed/spawn.py
@@ -110,6 +110,10 @@ def _get_default_nprocs():
        return core.get_xpu_device_count()
    elif 'cpu' in device:
        return multiprocessing.cpu_count()
    elif 'npu' in device:
        return core.get_custom_device_count('npu')
    elif 'mlu' in device:
        return core.get_custom_device_count('mlu')
Contributor
Change this to an approach that works for all custom devices. Don't decide with string matching, which can only support the two hardware types npu and mlu.

Contributor Author
done
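
A rough sketch of the direction the reviewer is pointing at, for readers following the thread (an illustration of the suggestion only, not the code that landed in the later commits; the helper name is hypothetical):

from paddle.framework import core

def _custom_device_nprocs(device):
    # Handle any hardware type registered through the custom device mechanism,
    # e.g. 'npu:0' -> 'npu', instead of hard-coding the npu/mlu strings.
    device_type = device.split(':')[0]
    if device_type in core.get_all_custom_device_type():
        return core.get_custom_device_count(device_type)
    raise RuntimeError(f"unsupported device `{device}`")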

    else:
        raise RuntimeError(
            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -126,6 +130,8 @@ def _get_default_backend():
        return 'bkcl'
    elif 'cpu' in device:
        return 'gloo'
    elif 'npu' or 'mlu' in device:
        return 'xccl'
Contributor
Same as above: change this to support every hardware type registered through the custom device mechanism, rather than deciding with string matching.

Contributor Author
done
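
Aside from the reviewer's point about string matching, the condition as written has a Python pitfall: `'npu' or 'mlu' in device` parses as `'npu' or ('mlu' in device)`, and the non-empty string 'npu' is always truthy, so this branch matches every device string. A quick check that shows the difference:

device = 'gpu:0'
print(bool('npu' or 'mlu' in device))      # True  -- the branch would be taken even for a GPU string
print('npu' in device or 'mlu' in device)  # False -- the membership test that was presumably intended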

    else:
        raise RuntimeError(
            "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -275,6 +281,29 @@ def _get_subprocess_env_list(nprocs, options):
        assert (
            _get_trainers_num() == 1
        ), "CPUONLY spawn doesn't support multi-trainer"
    elif options['backend'] == 'xccl':
        args.selected_devices = None
        custom_device_name = core.get_all_custom_device_type()[0]
        env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
        if env_devices is None or env_devices == "":
            env_devices_list = [
                str(x)
                for x in range(core.get_custom_device_count(custom_device_name))
            ]
        else:
            env_devices_list = env_devices.split(',')

        if len(env_devices_list) < nprocs:
            raise RuntimeError(
                "the number of visible devices(%d) is less than the number "
                "of spawn processes(%d), please ensure that the correct "
                "`nprocs` argument is passed or the environment variable "
                "`FLAGS_selected_%ss` is correctly configured."
                % (len(env_devices_list), nprocs, custom_device_name)
            )
        args.selected_devices = ",".join(
            [str(env_devices_list[x]) for x in range(0, nprocs)]
        )

    # set other inner args
    args.node_ip = options.get('node_ip', None)
12 changes: 12 additions & 0 deletions python/paddle/distributed/utils/launch_utils.py
@@ -437,6 +437,18 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
"PADDLE_DISTRI_BACKEND": backend, # only add here, other will be auto
}
    elif backend == 'xccl':
        from paddle.framework import core

        custom_device_name = core.get_all_custom_device_type()[0]
        proc_env = {
            f"FLAGS_selected_{custom_device_name}s": "%s"
            % ",".join([str(g) for g in trainer.gpus]),
            "PADDLE_TRAINER_ID": "%d" % trainer.rank,
            "PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
            "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
            "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
        }
    else:
        raise ValueError("backend must be one of 'gloo, nccl, bkcl'")

40 changes: 36 additions & 4 deletions python/paddle/utils/install_check.py
@@ -81,7 +81,23 @@ def _is_xpu_available():
        return False


def _run_dygraph_single(use_cuda, use_xpu):
def _is_custom_device_available():
    """
    Check whether Custom device is available.
    """
    try:
        assert len(paddle.framework.core.get_available_custom_device()) > 0
        return True
    except Exception as e:
        logging.warning(
            "You are using Custom device version PaddlePaddle, but there is no Custom devices "
            "detected on your machine. Maybe Custom devices is not set properly."
            "\n Original Error is {}".format(e)
        )
        return False


def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
"""
Testing the simple network in dygraph mode using one CPU/GPU/XPU.

@@ -94,6 +110,8 @@ def _run_dygraph_single(use_cuda, use_xpu):
        paddle.set_device('gpu')
    elif use_xpu:
        paddle.set_device('xpu')
    elif use_custom:
        paddle.set_device(custom_device_name)
    else:
        paddle.set_device('cpu')
    weight_attr = paddle.ParamAttr(
@@ -116,7 +134,7 @@ def _run_dygraph_single(use_cuda, use_xpu):
    opt.step()


def _run_static_single(use_cuda, use_xpu):
def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
"""
Testing the simple network with executor running directly, using one CPU/GPU/XPU.

@@ -139,6 +157,8 @@ def _run_static_single(use_cuda, use_xpu):
            place = paddle.CUDAPlace(0)
        elif use_xpu:
            place = paddle.XPUPlace(0)
        elif use_custom:
            place = paddle.CustomPlace(custom_device_name, 0)
        else:
            place = paddle.CPUPlace()

@@ -229,29 +249,41 @@ def run_check():

    use_cuda = False
    use_xpu = False
    use_custom = False
    custom_device_name = None

    if paddle.is_compiled_with_cuda():
        use_cuda = _is_cuda_available()
    elif paddle.is_compiled_with_xpu():
        use_xpu = _is_xpu_available()
    elif len(paddle.framework.core.get_all_custom_device_type()) == 1:
        use_custom = _is_custom_device_available()
Contributor
The check here at line 259 should be > 0, since more than one custom device may be registered. Also, the logic implemented inside _is_custom_device_available is the same as the elif condition on line 259, so the check is duplicated and one of the two can be removed.

Contributor Author
done
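
A minimal sketch of the detection logic with the reviewer's two points applied (use > 0 instead of == 1, and keep only one of the duplicated checks); illustrative only, not the exact follow-up commit:

import paddle

custom_types = paddle.framework.core.get_all_custom_device_type()
use_custom = len(custom_types) > 0   # more than one custom device type may be registered
custom_device_name = custom_types[0] if use_custom else None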


    if use_cuda:
        device_str = "GPU"
        device_list = paddle.static.cuda_places()
    elif use_xpu:
        device_str = "XPU"
        device_list = paddle.static.xpu_places()
    elif use_custom:
        device_str = paddle.framework.core.get_all_custom_device_type()[0]
        custom_device_name = device_str
        device_list = paddle.framework.core.get_available_custom_device()
    else:
        device_str = "CPU"
        device_list = paddle.static.cpu_places(device_count=1)
    device_count = len(device_list)

    _run_static_single(use_cuda, use_xpu)
    _run_dygraph_single(use_cuda, use_xpu)
    _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
    _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
    print(f"PaddlePaddle works well on 1 {device_str}.")

    try:
        if len(device_list) > 1:
            if use_custom is True:
                import os

                os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
Contributor Author
The distributed module sets the backend by reading the environment variable PADDLE_DISTRI_BACKEND, whose default value is auto. Here, once a custom device is detected, the backend is set to xccl explicitly to avoid ending up with the wrong backend.
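
For reference, the same behaviour from the user's side: because the backend is read from PADDLE_DISTRI_BACKEND (default auto), pinning it before calling run_check keeps the multi-device check on xccl. A minimal usage sketch, assuming a custom device build of PaddlePaddle:

import os
import paddle

# Mirror what run_check does internally for custom devices: force the
# collective backend so the parallel check does not pick an unsupported one.
os.environ['PADDLE_DISTRI_BACKEND'] = 'xccl'
paddle.utils.run_check()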

            _run_parallel(device_list)
            print(
                "PaddlePaddle works well on {} {}s.".format(