From 8b91dbdd8cdd8733140845cbfce0e1b047e5f20d Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 5 Apr 2023 18:01:56 +0800 Subject: [PATCH 1/3] register bf16 for communication ops --- paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc | 3 +++ paddle/fluid/operators/collective/c_split_op.cu | 3 +++ paddle/fluid/operators/collective/partial_allgather_op.cu.cc | 3 +++ paddle/fluid/operators/collective/partial_recv_op.cu.cc | 3 +++ paddle/fluid/operators/collective/partial_send_op.cu.cc | 3 +++ 5 files changed, 15 insertions(+) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 6ac228ca53dd51..8789820d4f8a02 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -28,6 +28,9 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, ALL_LAYOUT, ops::CAllReduceMaxCUDAKernel, float, +#if NCCL_VERSION_CODE >= 21000 + bfloat16, +#endif double, int, int64_t, diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 43dd64a5fdd7eb..c76e139b22b200 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -117,6 +117,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel<float>, +#if NCCL_VERSION_CODE >= 21000 + ops::CSplitOpCUDAKernel<plat::bfloat16>, +#endif ops::CSplitOpCUDAKernel<double>, ops::CSplitOpCUDAKernel<int>, ops::CSplitOpCUDAKernel<int64_t>, diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 94ad2432e46ff8..ce5a5438eff555 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -104,6 +104,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(partial_allgather, ops::PartialAllGatherOpCUDAKernel<float>, 
+#if NCCL_VERSION_CODE >= 21000 + ops::PartialAllGatherOpCUDAKernel<plat::bfloat16>, +#endif ops::PartialAllGatherOpCUDAKernel<double>, ops::PartialAllGatherOpCUDAKernel<int>, ops::PartialAllGatherOpCUDAKernel<int64_t>, diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index f9fb0ce1862324..306175d1ca7af8 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -120,6 +120,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(partial_recv, ops::PartialRecvOpCUDAKernel<float>, +#if NCCL_VERSION_CODE >= 21000 + ops::PartialRecvOpCUDAKernel<plat::bfloat16>, +#endif ops::PartialRecvOpCUDAKernel<double>, ops::PartialRecvOpCUDAKernel<int>, ops::PartialRecvOpCUDAKernel<int64_t>, diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index c73fa779c50693..afac7f963fa0dc 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -120,6 +120,9 @@ namespace plat = paddle::platform; REGISTER_OP_CUDA_KERNEL(partial_send, ops::PartialSendCUDAKernel<float>, ops::PartialSendCUDAKernel<double>, +#if NCCL_VERSION_CODE >= 21000 + ops::PartialSendCUDAKernel<plat::bfloat16>, +#endif ops::PartialSendCUDAKernel<int>, ops::PartialSendCUDAKernel<int64_t>, ops::PartialSendCUDAKernel<plat::float16>); From c9c82167dedcc44a0ca18349fa119500d037c79a Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 5 Apr 2023 20:01:22 +0800 Subject: [PATCH 2/3] formatting --- paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 8789820d4f8a02..3cac02a2ccabc6 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -34,4 +34,5 @@ 
PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, double, int, int64_t, - plat::float16) {} + plat::float16) { +} From 1f738554288a62e474d24af095e2c712c27aa1b9 Mon Sep 17 00:00:00 2001 From: shaojiewang Date: Wed, 5 Apr 2023 21:39:36 +0800 Subject: [PATCH 3/3] fix bfloat16 type finding compile error in c_allreduce_max_op --- paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 3cac02a2ccabc6..9be9674bb082bd 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -29,7 +29,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, ops::CAllReduceMaxCUDAKernel, float, #if NCCL_VERSION_CODE >= 21000 - bfloat16, + plat::bfloat16, #endif double, int,