[Custom Device] add run_check support for custom device #56318

Merged 6 commits on Aug 17, 2023
27 changes: 27 additions & 0 deletions python/paddle/distributed/spawn.py
@@ -110,6 +110,8 @@ def _get_default_nprocs():
return core.get_xpu_device_count()
elif 'cpu' in device:
return multiprocessing.cpu_count()
elif device in core.get_available_custom_device():
return core.get_custom_device_count(device.split(":")[0])
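The `device` string consumed here can carry an index suffix (for example `npu:0`), which is why it is split on `:` before querying the device count. A minimal sketch of that parsing, assuming a hypothetical plugin named `npu`:

```python
# Minimal sketch of the new branch's device-string handling; the plugin
# name "npu" is an assumption, not something fixed by this PR.
device = "npu:0"                    # device string with an index suffix
device_type = device.split(":")[0]  # drop the suffix -> "npu"
print(device_type)                  # nprocs would be core.get_custom_device_count(device_type)
```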
else:
raise RuntimeError(
"`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -126,6 +128,8 @@ def _get_default_backend():
return 'bkcl'
elif 'cpu' in device:
return 'gloo'
elif device in core.get_available_custom_device():
return 'xccl'
Contributor:

Same as above: change this to support every hardware type registered through custom device, rather than deciding by string matching.

Contributor Author:

done
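For reference, the default backend dispatch after this change is: `nccl` for GPU, `bkcl` for XPU, `gloo` for CPU, and `xccl` for any registered custom device. A self-contained sketch of that dispatch (illustrative only, not the actual `_get_default_backend` body):

```python
def pick_backend(device: str, custom_devices: list) -> str:
    # Mirrors the dispatch order above; custom_devices stands in for
    # core.get_available_custom_device().
    if 'gpu' in device:
        return 'nccl'
    if 'xpu' in device:
        return 'bkcl'
    if 'cpu' in device:
        return 'gloo'
    if device in custom_devices:
        return 'xccl'
    raise RuntimeError(f"unsupported device `{device}`")

print(pick_backend("npu:0", ["npu:0", "npu:1"]))  # prints: xccl
```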

else:
raise RuntimeError(
"`paddle.distributed.spawn` does not support parallel training on device `{}` now.".format(
@@ -275,6 +279,29 @@ def _get_subprocess_env_list(nprocs, options):
assert (
_get_trainers_num() == 1
), "CPUONLY spawn doesn't support multi-trainer"
elif options['backend'] == 'xccl':
args.selected_devices = None
custom_device_name = core.get_all_custom_device_type()[0]
env_devices = os.getenv(f"FLAGS_selected_{custom_device_name}s", None)
if env_devices is None or env_devices == "":
env_devices_list = [
str(x)
for x in range(core.get_custom_device_count(custom_device_name))
]
else:
env_devices_list = env_devices.split(',')

if len(env_devices_list) < nprocs:
raise RuntimeError(
"the number of visible devices(%d) is less than the number "
"of spawn processes(%d), please ensure that the correct "
"`nprocs` argument is passed or the environment variable "
"`FLAGS_selected_%ss` is correctly configured."
% (len(env_devices_list), nprocs, custom_device_name)
)
args.selected_devices = ",".join(
[str(env_devices_list[x]) for x in range(0, nprocs)]
)

# set other inner args
args.node_ip = options.get('node_ip', None)
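A hypothetical way to exercise this branch from user code: export the visibility flag, then spawn with the `xccl` backend. The plugin name `npu` and the `train` function are assumptions for illustration:

```python
import os

# Make only two devices of a hypothetical "npu" plugin visible;
# _get_subprocess_env_list reads FLAGS_selected_npus to fill selected_devices.
os.environ["FLAGS_selected_npus"] = "0,1"

import paddle.distributed as dist

# Requires the plugin to be installed, hence left commented out:
# dist.spawn(train, nprocs=2, backend='xccl')  # train() is user-defined
```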
12 changes: 12 additions & 0 deletions python/paddle/distributed/utils/launch_utils.py
@@ -437,6 +437,18 @@ def _prepare_trainer_env(cluster, trainer, backend=None):
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
"PADDLE_DISTRI_BACKEND": backend, # only add here, other will be auto
}
elif backend == 'xccl':
from paddle.framework import core

custom_device_name = core.get_all_custom_device_type()[0]
proc_env = {
f"FLAGS_selected_{custom_device_name}s": "%s"
% ",".join([str(g) for g in trainer.gpus]),
"PADDLE_TRAINER_ID": "%d" % trainer.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % trainer.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
}
else:
raise ValueError("backend must be one of 'gloo', 'nccl', 'bkcl', 'xccl'")
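For a concrete picture, the `xccl` branch above would hand a trainer an environment shaped like the following; the device type `npu`, the ranks, and the endpoints are made-up values:

```python
# Shape of proc_env for a hypothetical rank-0 trainer owning devices [0, 1]
# of an "npu" plugin on a single node (all values illustrative):
proc_env = {
    "FLAGS_selected_npus": "0,1",
    "PADDLE_TRAINER_ID": "0",
    "PADDLE_CURRENT_ENDPOINT": "127.0.0.1:6170",
    "PADDLE_TRAINERS_NUM": "1",
    "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:6170",
}
```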

36 changes: 32 additions & 4 deletions python/paddle/utils/install_check.py
@@ -81,7 +81,7 @@ def _is_xpu_available():
return False


def _run_dygraph_single(use_cuda, use_xpu):
def _run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name):
"""
Testing the simple network in dygraph mode using one CPU/GPU/XPU.

@@ -94,6 +94,8 @@ def _run_dygraph_single(use_cuda, use_xpu):
paddle.set_device('gpu')
elif use_xpu:
paddle.set_device('xpu')
elif use_custom:
paddle.set_device(custom_device_name)
else:
paddle.set_device('cpu')
weight_attr = paddle.ParamAttr(
@@ -116,7 +118,7 @@ def _run_dygraph_single(use_cuda, use_xpu):
opt.step()


def _run_static_single(use_cuda, use_xpu):
def _run_static_single(use_cuda, use_xpu, use_custom, custom_device_name):
"""
Testing the simple network with executor running directly, using one CPU/GPU/XPU.

@@ -139,6 +141,8 @@ def _run_static_single(use_cuda, use_xpu):
place = paddle.CUDAPlace(0)
elif use_xpu:
place = paddle.XPUPlace(0)
elif use_custom:
place = paddle.CustomPlace(custom_device_name, 0)
else:
place = paddle.CPUPlace()
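Outside of the install check, the same place selection can be done by hand. A hedged sketch, assuming a plugin registered as `npu` (the calls are commented out because they need that plugin installed):

```python
import paddle

paddle.enable_static()
# place = paddle.CustomPlace("npu", 0)   # (device type, device id)
# exe = paddle.static.Executor(place)
# Dynamic-graph equivalent: paddle.set_device("npu:0")
```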

@@ -229,29 +233,53 @@ def run_check():

use_cuda = False
use_xpu = False
use_custom = False
custom_device_name = None

if paddle.is_compiled_with_cuda():
use_cuda = _is_cuda_available()
elif paddle.is_compiled_with_xpu():
use_xpu = _is_xpu_available()
elif len(paddle.framework.core.get_all_custom_device_type()) > 0:
use_custom = True
if len(paddle.framework.core.get_all_custom_device_type()) > 1:
logging.warning(
"More than one kind of custom device is detected, but run_check will only be executed on {}.".format(
paddle.framework.core.get_all_custom_device_type()[0]
)
)

if use_cuda:
device_str = "GPU"
device_list = paddle.static.cuda_places()
elif use_xpu:
device_str = "XPU"
device_list = paddle.static.xpu_places()
elif use_custom:
device_str = paddle.framework.core.get_all_custom_device_type()[0]
custom_device_name = device_str
device_list = list(
range(
paddle.framework.core.get_custom_device_count(
custom_device_name
)
)
)
Contributor:

This defaults to running only on device[0]. Check whether more than one device is registered and, if so, add a warning message noting that the check only covers device[0].

Contributor Author:

done

else:
device_str = "CPU"
device_list = paddle.static.cpu_places(device_count=1)
device_count = len(device_list)
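Note that, unlike `paddle.static.cuda_places()`, the custom-device branch builds `device_list` from plain integer ids. With four devices of an assumed `npu` plugin registered, it would come out as:

```python
# Assuming core.get_custom_device_count("npu") returns 4:
device_list = list(range(4))     # [0, 1, 2, 3]
device_count = len(device_list)  # 4
```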

_run_static_single(use_cuda, use_xpu)
_run_dygraph_single(use_cuda, use_xpu)
_run_static_single(use_cuda, use_xpu, use_custom, custom_device_name)
_run_dygraph_single(use_cuda, use_xpu, use_custom, custom_device_name)
print(f"PaddlePaddle works well on 1 {device_str}.")

try:
if len(device_list) > 1:
if use_custom:
import os

os.environ['PADDLE_DISTRI_BACKEND'] = "xccl"
Contributor Author:

The distributed module sets the backend by reading the environment variable PADDLE_DISTRI_BACKEND, whose default value is auto. Once a custom device is detected here, the backend is manually set to xccl so that a wrong backend is not selected.

_run_parallel(device_list)
print(
"PaddlePaddle works well on {} {}s.".format(
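Taken together, a hypothetical end-to-end check on a machine with a custom device plugin installed would look like the sketch below; the expected output wording is approximated from the prints above:

```python
import paddle

# Needs a custom device plugin installed, hence commented out:
# paddle.utils.run_check()
#
# Expected single-device output, per the print above:
#   PaddlePaddle works well on 1 <device_type>.
# With more than one device visible, run_check also sets
# PADDLE_DISTRI_BACKEND=xccl and runs the parallel check.
```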