From 102ab57087b3730e46825c8dd63e8e2b49f4129b Mon Sep 17 00:00:00 2001
From: haohongxiang
Date: Sat, 8 Oct 2022 09:23:47 +0000
Subject: [PATCH 1/2] fix perf of fuse op with allreduce

---
 paddle/fluid/operators/fused/fused_attention_op.cu   | 7 +++++--
 paddle/fluid/operators/fused/fused_feedforward_op.cu | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index 62ea3f723dc9e9..e7df3561073772 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -30,7 +30,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -50,13 +50,16 @@ static void AllReduce(phi::DenseTensor &tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup *pg = map->get(ring_id);
+    auto pg_strm = static_cast<distributed::ProcessGroupStream *>(pg);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL *>(pg_strm);
+
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
     in_tensor.push_back(tensor);
     out_tensor.push_back(tensor);
     paddle::distributed::AllreduceOptions opts;
     opts.reduce_op = distributed::ReduceOp::SUM;
-    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true);
     task->Wait();
   } else {
     auto dtype = platform::ToNCCLDataType(
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 6084b1f61f80c0..7b0a68dd1b1c97 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -43,13 +43,16 @@ static void AllReduce(phi::DenseTensor& tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup* pg = map->get(ring_id);
+    auto pg_strm = static_cast<distributed::ProcessGroupStream*>(pg);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL*>(pg_strm);
+
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
     in_tensor.push_back(tensor);
     out_tensor.push_back(tensor);
     paddle::distributed::AllreduceOptions opts;
     opts.reduce_op = distributed::ReduceOp::SUM;
-    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true);
     task->Wait();
   } else {
     auto dtype = platform::ToNCCLDataType(

From b5e32b7b86e9e0f137074970ea8f6fc67eab2fa2 Mon Sep 17 00:00:00 2001
From: haohongxiang
Date: Sat, 8 Oct 2022 16:41:04 +0000
Subject: [PATCH 2/2] update

---
 paddle/fluid/operators/fused/fused_attention_op.cu   | 3 +--
 paddle/fluid/operators/fused/fused_feedforward_op.cu | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index e7df3561073772..d03b76adef3f32 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -50,8 +50,7 @@ static void AllReduce(phi::DenseTensor &tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup *pg = map->get(ring_id);
-    auto pg_strm = static_cast<distributed::ProcessGroupStream *>(pg);
-    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL *>(pg_strm);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL *>(pg);
 
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 7b0a68dd1b1c97..90af129296c8e6 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -43,8 +43,7 @@ static void AllReduce(phi::DenseTensor& tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup* pg = map->get(ring_id);
-    auto pg_strm = static_cast<distributed::ProcessGroupStream*>(pg);
-    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL*>(pg_strm);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL*>(pg);
 
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
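Note (not part of the patches above): the performance fix is to issue the fused-op allreduce through the NCCL-specific process group rather than the generic ProcessGroup interface. A minimal sketch of the resulting call path follows; it assumes Paddle's 2022-era collective API, where ProcessGroupStream/ProcessGroupNCCL expose an AllReduce overload with two trailing flags, (sync_op, use_calc_stream), so that passing `true, true` runs the allreduce synchronously on the computation stream and avoids a cross-stream event wait per call. The helper name AllReduceOnCalcStream is hypothetical.

    #include <vector>

    #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
    #include "paddle/phi/core/dense_tensor.h"

    // Sketch: in-place SUM allreduce issued on the calculation stream.
    // `pg` must actually be a ProcessGroupNCCL; the static_cast mirrors
    // the patch and performs no runtime type check.
    static void AllReduceOnCalcStream(phi::DenseTensor &tensor,  // NOLINT
                                      paddle::distributed::ProcessGroup *pg) {
      auto pg_nccl =
          static_cast<paddle::distributed::ProcessGroupNCCL *>(pg);
      std::vector<phi::DenseTensor> in_tensor{tensor};
      std::vector<phi::DenseTensor> out_tensor{tensor};
      paddle::distributed::AllreduceOptions opts;
      opts.reduce_op = paddle::distributed::ReduceOp::SUM;
      // Assumed overload: AllReduce(in, out, opts, sync_op, use_calc_stream).
      auto task = pg_nccl->AllReduce(in_tensor,
                                     out_tensor,
                                     opts,
                                     /*sync_op=*/true,
                                     /*use_calc_stream=*/true);
      task->Wait();
    }

Patch 2/2 drops the intermediate ProcessGroupStream cast from patch 1/2; since ProcessGroupNCCL derives from ProcessGroupStream in this API, a single cast to ProcessGroupNCCL suffices.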