From 102ab57087b3730e46825c8dd63e8e2b49f4129b Mon Sep 17 00:00:00 2001
From: haohongxiang
Date: Sat, 8 Oct 2022 09:23:47 +0000
Subject: [PATCH 1/2] fix perf of fuse op with allreduce

---
 paddle/fluid/operators/fused/fused_attention_op.cu   | 7 +++++--
 paddle/fluid/operators/fused/fused_feedforward_op.cu | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index 62ea3f723dc9e9..e7df3561073772 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -30,7 +30,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -50,13 +50,16 @@ static void AllReduce(phi::DenseTensor &tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup *pg = map->get(ring_id);
+    auto pg_strm = static_cast<distributed::ProcessGroupStream *>(pg);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL *>(pg_strm);
+
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
     in_tensor.push_back(tensor);
     out_tensor.push_back(tensor);
     paddle::distributed::AllreduceOptions opts;
     opts.reduce_op = distributed::ReduceOp::SUM;
-    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true);
     task->Wait();
   } else {
     auto dtype = platform::ToNCCLDataType(
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 6084b1f61f80c0..7b0a68dd1b1c97 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"
 
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
-#include "paddle/fluid/distributed/collective/ProcessGroup.h"
+#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif
@@ -43,13 +43,16 @@ static void AllReduce(phi::DenseTensor& tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup* pg = map->get(ring_id);
+    auto pg_strm = static_cast<distributed::ProcessGroupStream*>(pg);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL*>(pg_strm);
+
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
     in_tensor.push_back(tensor);
     out_tensor.push_back(tensor);
     paddle::distributed::AllreduceOptions opts;
     opts.reduce_op = distributed::ReduceOp::SUM;
-    auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+    auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true);
     task->Wait();
   } else {
     auto dtype = platform::ToNCCLDataType(

From b5e32b7b86e9e0f137074970ea8f6fc67eab2fa2 Mon Sep 17 00:00:00 2001
From: haohongxiang
Date: Sat, 8 Oct 2022 16:41:04 +0000
Subject: [PATCH 2/2] update

---
 paddle/fluid/operators/fused/fused_attention_op.cu   | 3 +--
 paddle/fluid/operators/fused/fused_feedforward_op.cu | 3 +--
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu
index e7df3561073772..d03b76adef3f32 100644
--- a/paddle/fluid/operators/fused/fused_attention_op.cu
+++ b/paddle/fluid/operators/fused/fused_attention_op.cu
@@ -50,8 +50,7 @@ static void AllReduce(phi::DenseTensor &tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup *pg = map->get(ring_id);
-    auto pg_strm = static_cast<distributed::ProcessGroupStream *>(pg);
-    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL *>(pg_strm);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL *>(pg);
 
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu
index 7b0a68dd1b1c97..90af129296c8e6 100644
--- a/paddle/fluid/operators/fused/fused_feedforward_op.cu
+++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu
@@ -43,8 +43,7 @@ static void AllReduce(phi::DenseTensor& tensor,  // NOLINT
 
   if (map->has(ring_id)) {
     paddle::distributed::ProcessGroup* pg = map->get(ring_id);
-    auto pg_strm = static_cast<distributed::ProcessGroupStream*>(pg);
-    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL*>(pg_strm);
+    auto pg_nccl = static_cast<distributed::ProcessGroupNCCL*>(pg);
 
     std::vector<phi::DenseTensor> in_tensor;
     std::vector<phi::DenseTensor> out_tensor;
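Note (not part of the patches above): the performance fix is to issue the fused-op allreduce through the NCCL-specific process group rather than the generic ProcessGroup interface. A minimal sketch of the resulting call path follows; it assumes Paddle's 2022-era collective API, where ProcessGroupStream/ProcessGroupNCCL expose an AllReduce overload with two trailing flags, (sync_op, use_calc_stream), so that passing `true, true` runs the allreduce synchronously on the computation stream and avoids a cross-stream event wait per call. The helper name AllReduceOnCalcStream is hypothetical.

    #include <vector>

    #include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h"
    #include "paddle/phi/core/dense_tensor.h"

    // Sketch: in-place SUM allreduce issued on the calculation stream.
    // `pg` must actually be a ProcessGroupNCCL; the static_cast mirrors
    // the patch and performs no runtime type check.
    static void AllReduceOnCalcStream(phi::DenseTensor &tensor,  // NOLINT
                                      paddle::distributed::ProcessGroup *pg) {
      auto pg_nccl =
          static_cast<paddle::distributed::ProcessGroupNCCL *>(pg);
      std::vector<phi::DenseTensor> in_tensor{tensor};
      std::vector<phi::DenseTensor> out_tensor{tensor};
      paddle::distributed::AllreduceOptions opts;
      opts.reduce_op = paddle::distributed::ReduceOp::SUM;
      // Assumed overload: AllReduce(in, out, opts, sync_op, use_calc_stream).
      auto task = pg_nccl->AllReduce(in_tensor,
                                     out_tensor,
                                     opts,
                                     /*sync_op=*/true,
                                     /*use_calc_stream=*/true);
      task->Wait();
    }

Patch 2/2 drops the intermediate ProcessGroupStream cast from patch 1/2; since ProcessGroupNCCL derives from ProcessGroupStream in this API, a single cast to ProcessGroupNCCL suffices.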