[Custom Device] add run_check support for custom device #56318
Changes from 3 commits
@@ -110,6 +110,10 @@ def _get_default_nprocs():
         return core.get_xpu_device_count()
     elif 'cpu' in device:
         return multiprocessing.cpu_count()
+    elif 'npu' in device:
+        return core.get_custom_device_count('npu')
+    elif 'mlu' in device:
+        return core.get_custom_device_count('mlu')
     else:
         raise RuntimeError(
             "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
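The reviewer's closing comment (at the end of this page) asks for this branch to cover every registered custom device type rather than the hard-coded 'npu' and 'mlu' strings. A minimal sketch of that shape, assuming `device` is the string returned by `get_device()` (for example 'npu:0') and using only the `core` helpers that already appear in this diff:

    # Hedged sketch, not the merged code: enumerate the registered
    # custom device types instead of hard-coding 'npu' and 'mlu'.
    for dev_type in core.get_all_custom_device_type():
        if dev_type in device:
            return core.get_custom_device_count(dev_type)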
@@ -126,6 +130,8 @@ def _get_default_backend():
         return 'bkcl'
     elif 'cpu' in device:
         return 'gloo'
+    elif 'npu' or 'mlu' in device:
+        return 'xccl'

Review comment: Same as above, change this to support all hardware types registered through custom device; don't rely on string matching.
Reply: done

     else:
         raise RuntimeError(
             "`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
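Beyond the string-matching concern the reviewer raises, the added condition also has a Python precedence pitfall: `'npu' or 'mlu' in device` parses as `'npu' or ('mlu' in device)`, and the non-empty literal `'npu'` is always truthy, so this branch matches any device string:

    >>> device = 'cpu'
    >>> bool('npu' or 'mlu' in device)       # always True: non-empty 'npu' wins
    True
    >>> 'npu' in device or 'mlu' in device   # the presumably intended check
    False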
@@ -275,6 +281,29 @@ def _get_subprocess_env_list(nprocs, options):
         assert (
             _get_trainers_num() == 1
         ), "CPUONLY spawn doesn't support multi-trainer"
+    elif options['backend'] == 'xccl':
+        args.selected_devices = None
+        custom_device_name = core.get_all_custom_device_type()[0]
+        env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
+        if env_devices is None or env_devices == "":
+            env_devices_list = [
+                str(x)
+                for x in range(core.get_custom_device_count(custom_device_name))
+            ]
+        else:
+            env_devices_list = env_devices.split(',')
+
+        if len(env_devices_list) < nprocs:
+            raise RuntimeError(
+                "the number of visible devices(%d) is less than the number "
+                "of spawn processes(%d), please ensure that the correct "
+                "`nprocs` argument is passed or the environment variable "
+                "`FLAGS_selected_%ss` is correctly configured."
+                % (len(env_devices_list), nprocs, custom_device_name)
+            )
+        args.selected_devices = ",".join(
+            [str(env_devices_list[x]) for x in range(0, nprocs)]
+        )
+
     # set other inner args
     args.node_ip = options.get('node_ip', None)
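The added 'xccl' branch mirrors the existing GPU/XPU handling: the visible device list comes from `FLAGS_selected_<type>s` when that variable is set, otherwise from the full device count, and the first `nprocs` entries are assigned to the spawned workers. A hypothetical invocation, with 'npu' standing in for whatever device type is registered:

    # Hypothetical usage sketch (the training function and flag values
    # are examples, not part of this PR):
    #   FLAGS_selected_npus=0,1 python train.py
    import paddle.distributed as dist

    def train():
        # one spawned process per selected device
        dist.init_parallel_env()

    if __name__ == '__main__':
        # With the branch above, selected_devices resolves to "0,1".
        dist.spawn(train, nprocs=2, backend='xccl')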
@@ -81,7 +81,23 @@ def _is_xpu_available():
         return False


-def _run_dygraph_single(use_cuda, use_xpu):
+def _is_custom_device_available():
+    """
+    Check whether Custom device is available.
+    """
+    try:
+        assert len(paddle.framework.core.get_available_custom_device()) > 0
+        return True
+    except Exception as e:
+        logging.warning(
+            "You are using Custom device version PaddlePaddle, but there is no Custom devices "
+            "detected on your machine. Maybe Custom devices is not set properly."
+            "\n Original Error is {}".format(e)
+        )
+        return False
+
+
+def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
     """
     Testing the simple network in dygraph mode using one CPU/GPU/XPU.
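A side note on this helper: the assert serves only as control flow, and the reviewer later points out that `run_check` guards the call with the same condition anyway. A hedged, assert-free sketch of an equivalent check, assuming the module-level `paddle` and `logging` imports that `_is_xpu_available` already relies on:

    def _is_custom_device_available():
        """Check whether any custom device is visible to this build."""
        if len(paddle.framework.core.get_available_custom_device()) > 0:
            return True
        logging.warning(
            "You are using a Custom-device build of PaddlePaddle, but no "
            "custom device was detected on this machine."
        )
        return False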
@@ -94,6 +110,8 @@ def _run_dygraph_single(use_cuda, use_xpu):
         paddle.set_device('gpu')
     elif use_xpu:
         paddle.set_device('xpu')
+    elif use_custom:
+        paddle.set_device(custom_device_name)
     else:
         paddle.set_device('cpu')
     weight_attr = paddle.ParamAttr(
@@ -116,7 +134,7 @@ def _run_dygraph_single(use_cuda, use_xpu):
     opt.step()


-def _run_static_single(use_cuda, use_xpu):
+def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
     """
     Testing the simple network with executor running directly, using one CPU/GPU/XPU.
@@ -139,6 +157,8 @@ def _run_static_single(use_cuda, use_xpu):
         place = paddle.CUDAPlace(0)
     elif use_xpu:
         place = paddle.XPUPlace(0)
+    elif use_custom:
+        place = paddle.CustomPlace(custom_device_name, 0)
     else:
         place = paddle.CPUPlace()
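Taken together, the two `_run_*_single` changes select the custom device in both execution modes: dygraph addresses a device by name string, while the static-graph executor takes a place object, which is why the patch threads both `custom_device_name` and a `CustomPlace` through. Both forms below are public Paddle APIs; the 'npu' literal is only an example and requires the matching device plugin to be installed:

    import paddle

    paddle.set_device('npu')              # dygraph: "<dev_type>" or "<dev_type>:<id>"
    place = paddle.CustomPlace('npu', 0)  # static graph: (device type, device id)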
@@ -229,29 +249,41 @@ def run_check():

     use_cuda = False
     use_xpu = False
+    use_custom = False
+    custom_device_name = None

     if paddle.is_compiled_with_cuda():
         use_cuda = _is_cuda_available()
     elif paddle.is_compiled_with_xpu():
         use_xpu = _is_xpu_available()
+    elif len(paddle.framework.core.get_all_custom_device_type()) == 1:
+        use_custom = _is_custom_device_available()

Review comment: The check on line 259 here should be > 0, since multiple custom device types can be registered. Also, the logic inside _is_custom_device_available is the same as this elif condition on line 259; the check is duplicated and one of the two can be removed.
Reply: done

     if use_cuda:
         device_str = "GPU"
         device_list = paddle.static.cuda_places()
     elif use_xpu:
         device_str = "XPU"
         device_list = paddle.static.xpu_places()
+    elif use_custom:
+        device_str = paddle.framework.core.get_all_custom_device_type()[0]
+        custom_device_name = device_str
+        device_list = paddle.framework.core.get_available_custom_device()
     else:
         device_str = "CPU"
         device_list = paddle.static.cpu_places(device_count=1)
     device_count = len(device_list)

-    _run_static_single(use_cuda, use_xpu)
-    _run_dygraph_single(use_cuda, use_xpu)
+    _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
+    _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
     print(f"PaddlePaddle works well on 1 {device_str}.")

     try:
         if len(device_list) > 1:
+            if use_custom is True:
+                import os
+
+                os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"

Review comment (truncated in the page): The distributed module picks this up by reading the environment variable …

             _run_parallel(device_list)
             print(
                 "PaddlePaddle works well on {} {}s.".format(
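For context on the truncated review comment above: the patch exports `PADDLE_DISTRI_BACKEND` so that the parallel run launched by `_run_parallel` selects the 'xccl' backend from the environment rather than from an explicit argument. A rough sketch of the consumer side, hedged because the exact lookup lives inside Paddle's distributed module:

    import os

    # Assumption about the consumer side: init_parallel_env falls back to
    # this variable when no backend is passed explicitly.
    backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto')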
Review comment (on the hard-coded npu/mlu checks earlier in this diff): Change this to an approach that works for all custom devices; don't use string matching, which can only support the two hardware types npu and mlu.
Reply: done