From d05b73e4ca99df5a7531f283ab09fd2aeadb10ec Mon Sep 17 00:00:00 2001
From: huangjiyi <43315610+huangjiyi@users.noreply.github.com>
Date: Fri, 31 Mar 2023 19:43:25 +0800
Subject: [PATCH] register fluid kernels to phi [part 2] (#52044)

* update bipartite_match

* update

* fix bug

* fix test

* fix bug

* fix Kunlun-KP-Build

* Revert "fix Kunlun-KP-Build"

This reverts commit ceab63cc23079fd6839c826bb52db893fb056355.

* update
---
 paddle/fluid/operators/bpr_loss_op.cc         | 16 +++++++-----
 paddle/fluid/operators/bpr_loss_op.h          |  4 +--
 .../operators/collective/c_allgather_op.cc    | 21 ++++++++-------
 .../operators/collective/c_allgather_op.cu.cc | 26 +++++++++++--------
 .../operators/collective/c_allgather_op.h     |  2 +-
 .../collective/c_allreduce_max_op.cc          | 18 ++++++++-----
 .../collective/c_allreduce_max_op.cu.cc       | 22 +++++++++++-----
 .../collective/c_allreduce_min_op.cc          | 17 +++++++-----
 .../collective/c_allreduce_min_op.cu.cc       | 22 +++++++++++-----
 .../operators/collective/c_allreduce_op.h     |  8 ++++++
 .../collective/c_allreduce_prod_op.cc         | 17 +++++++-----
 .../collective/c_allreduce_prod_op.cu.cc      | 22 +++++++++++-----
 .../collective/c_allreduce_sum_op.cc          | 17 +++++++-----
 .../collective/c_allreduce_sum_op.cu.cc       | 25 ++++++++++++------
 .../operators/detection/bipartite_match_op.cc | 12 ++++++---
 .../fluid/operators/detection/box_clip_op.cc  |  6 ++---
 .../fluid/operators/detection/box_clip_op.cu  |  8 +++---
 .../fluid/operators/detection/box_clip_op.h   |  2 +-
 .../detection/box_decoder_and_assign_op.cc    |  9 ++++---
 .../detection/box_decoder_and_assign_op.cu    | 13 ++++++----
 .../detection/box_decoder_and_assign_op.h     |  2 +-
 .../unittests/test_bipartite_match_op.py      | 10 +++----
 .../fluid/tests/unittests/test_box_clip_op.py |  2 +-
 23 files changed, 190 insertions(+), 111 deletions(-)
diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc
index 47aea124430373..4a2928338251e1 100644
--- a/paddle/fluid/operators/bpr_loss_op.cc
+++ b/paddle/fluid/operators/bpr_loss_op.cc
@@ -174,7 +174,6 @@ class BprLossGradMaker : public framework::SingleGradOpMaker<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-using CPUCtx = phi::CPUContext;
 
 REGISTER_OPERATOR(bpr_loss,
                   ops::BprLossOp,
@@ -182,9 +181,12 @@ REGISTER_OPERATOR(bpr_loss,
                   ops::BprLossGradMaker<paddle::framework::OpDesc>,
                   ops::BprLossGradMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
-REGISTER_OP_CPU_KERNEL(bpr_loss,
-                       ops::BprLossOpKernel<CPUCtx, float>,
-                       ops::BprLossOpKernel<CPUCtx, double>);
-REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
-                       ops::BprLossGradientOpKernel<CPUCtx, float>,
-                       ops::BprLossGradientOpKernel<CPUCtx, double>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    bpr_loss, CPU, ALL_LAYOUT, ops::BprLossOpKernel, float, double) {}
+PD_REGISTER_STRUCT_KERNEL(bpr_loss_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::BprLossGradientOpKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
index 679f2687849c38..1a1bddc17c2e59 100644
--- a/paddle/fluid/operators/bpr_loss_op.h
+++ b/paddle/fluid/operators/bpr_loss_op.h
@@ -35,7 +35,7 @@ struct TolerableValue {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class BprLossOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -83,7 +83,7 @@ class BprLossOpKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class BprLossGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc
index 83449de91017db..4e9cadbddf4865 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cc
@@ -73,12 +73,15 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allgather,
                              ops::CAllGatherOp,
                              ops::CAllGatherOpMaker);
 
-REGISTER_OP_CPU_KERNEL(c_allgather,
-                       ops::CAllGatherOpCPUKernel<float>,
-                       ops::CAllGatherOpCPUKernel<double>,
-                       ops::CAllGatherOpCPUKernel<int>,
-                       ops::CAllGatherOpCPUKernel<int64_t>,
-                       ops::CAllGatherOpCPUKernel<uint8_t>,
-                       ops::CAllGatherOpCPUKernel<int8_t>,
-                       ops::CAllGatherOpCPUKernel<bool>,
-                       ops::CAllGatherOpCPUKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(c_allgather,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CAllGatherOpCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int8_t,
+                          int64_t,
+                          uint8_t,
+                          bool,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
index 075abc49a72baa..93be43a1a324a6 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc
@@ -25,7 +25,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -93,15 +93,19 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(c_allgather,
-                        ops::CAllGatherOpCUDAKernel<float>,
-                        ops::CAllGatherOpCUDAKernel<double>,
+PD_REGISTER_STRUCT_KERNEL(c_allgather,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::CAllGatherOpCUDAKernel,
+                          float,
+                          double,
 #if NCCL_VERSION_CODE >= 21000
-                        ops::CAllGatherOpCUDAKernel<plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-                        ops::CAllGatherOpCUDAKernel<int>,
-                        ops::CAllGatherOpCUDAKernel<uint8_t>,
-                        ops::CAllGatherOpCUDAKernel<int8_t>,
-                        ops::CAllGatherOpCUDAKernel<int64_t>,
-                        ops::CAllGatherOpCUDAKernel<bool>,
-                        ops::CAllGatherOpCUDAKernel<plat::float16>);
+                          int,
+                          uint8_t,
+                          int8_t,
+                          int64_t,
+                          bool,
+                          plat::float16) {
+}
diff --git a/paddle/fluid/operators/collective/c_allgather_op.h b/paddle/fluid/operators/collective/c_allgather_op.h
index 198ec4009f4d37..e896f96ead5329 100644
--- a/paddle/fluid/operators/collective/c_allgather_op.h
+++ b/paddle/fluid/operators/collective/c_allgather_op.h
@@ -32,7 +32,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class CAllGatherOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
index d339a51368a1af..c47bf7025e1fd0 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc
@@ -35,6 +35,8 @@ class CAllReduceMaxOpMaker : public CAllReduceOpMaker {
 
 DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"});
 
+DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceMax, kRedMax)
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -45,10 +47,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max,
                              ops::CAllReduceOp,
                              ops::CAllReduceMaxOpMaker,
                              ops::AllreduceMaxInplaceInferer)
-
-REGISTER_OP_CPU_KERNEL(c_allreduce_max,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMax, plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_max,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceMaxCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
index fb073c8173d30a..6ac228ca53dd51 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
@@ -14,13 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace operators {
+DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceMax, kRedMax)
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_max,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, float>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_max,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceMaxCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc
index 8c1dd0172a808a..c21337a27202e1 100644
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc
@@ -35,6 +35,8 @@ class CAllReduceMinOpMaker : public CAllReduceOpMaker {
 
 DECLARE_INPLACE_OP_INFERER(AllreduceMinInplaceInferer, {"X", "Out"});
 
+DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceMin, kRedMin)
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -46,9 +48,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min,
                              ops::CAllReduceMinOpMaker,
                              ops::AllreduceMinInplaceInferer)
 
-REGISTER_OP_CPU_KERNEL(c_allreduce_min,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedMin, plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_min,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceMinCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
index c80a224f460dc9..4475abdef281b7 100644
--- a/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
@@ -14,13 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace operators {
+DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceMin, kRedMin)
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_min,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, float>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_min,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceMinCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h
index f63c4a9abcdc6e..e292864a8e7748 100644
--- a/paddle/fluid/operators/collective/c_allreduce_op.h
+++ b/paddle/fluid/operators/collective/c_allreduce_op.h
@@ -148,6 +148,10 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
   }
 };
 
+#define DEFINE_C_ALLREDUCE_CPU_KERNEL(op_name, red_type) \
+  template <typename T, typename DeviceContext>          \
+  class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};
+
 #if defined(PADDLE_WITH_ASCEND_CL)
 // return true if found_nan or return false;
 inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
@@ -527,6 +531,10 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
   }
 };
 
+#define DEFINE_C_ALLREDUCE_CUDA_KERNEL(op_name, red_type) \
+  template <typename T, typename DeviceContext>           \
+  class op_name##CUDAKernel : public CAllReduceOpCUDAKernel<red_type, T> {};
+
 template <ReduceType red_type, typename T>
 class CAllReduceOpMLUKernel : public framework::OpKernel<T> {
  public:
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
index 1d07b8226ab86d..ee40f29d789e15 100644
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc
@@ -35,6 +35,8 @@ class CAllReduceProdOpMaker : public CAllReduceOpMaker {
 
 DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"});
 
+DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceProd, kRedProd)
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -46,9 +48,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod,
                              ops::CAllReduceProdOpMaker,
                              ops::AllreduceProdInplaceInferer)
 
-REGISTER_OP_CPU_KERNEL(c_allreduce_prod,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedProd, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceProdCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
index 1ca4b6ecc18952..c63a1d2182678b 100644
--- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
@@ -14,13 +14,21 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace operators {
+DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceProd, kRedProd)
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_prod,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, float>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceProdCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
index 1fd77d3ab96d6c..79e70757fbcfdd 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc
@@ -56,6 +56,8 @@ class CAllReduceSumOpMaker : public CAllReduceOpMaker {
 
 DECLARE_INPLACE_OP_INFERER(AllreduceSumInplaceInferer, {"X", "Out"});
 
+DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceSum, kRedSum)
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -67,9 +69,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_sum,
                              ops::CAllReduceSumOpMaker,
                              ops::AllreduceSumInplaceInferer)
 
-REGISTER_OP_CPU_KERNEL(c_allreduce_sum,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceSumCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
index 5e00c3bdba3fdc..8e45b7e43b2ed1 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
@@ -14,16 +14,25 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace operators {
+DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceSum, kRedSum)
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(
-    c_allreduce_sum,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
+PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::CAllReduceSumCUDAKernel,
+                          float,
 #if NCCL_VERSION_CODE >= 21000
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {
+}
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index 8bf542e17caed0..29b732536437d2 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -64,7 +64,7 @@ bool DistPairDescend(std::tuple<int, int, T> pair1,
   return std::get<2>(pair1) > std::get<2>(pair2);
 }
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class BipartiteMatchKernel : public framework::OpKernel<T> {
  public:
   // The match_indices must be initialized to -1 at first.
@@ -318,6 +318,10 @@ REGISTER_OPERATOR(
     ops::BipartiteMatchOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(bipartite_match,
-                       ops::BipartiteMatchKernel<float>,
-                       ops::BipartiteMatchKernel<double>);
+
+PD_REGISTER_STRUCT_KERNEL(bipartite_match,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::BipartiteMatchKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/detection/box_clip_op.cc b/paddle/fluid/operators/detection/box_clip_op.cc
index 46363ceb5355ea..5af100b8f64071 100644
--- a/paddle/fluid/operators/detection/box_clip_op.cc
+++ b/paddle/fluid/operators/detection/box_clip_op.cc
@@ -104,6 +104,6 @@ REGISTER_OPERATOR(
     ops::BoxClipOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(box_clip,
-                       ops::BoxClipKernel<phi::CPUContext, float>,
-                       ops::BoxClipKernel<phi::CPUContext, double>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    box_clip, CPU, ALL_LAYOUT, ops::BoxClipKernel, float, double) {}
diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu
index 4a1558de906b3c..ef0f2439c5ee1c 100644
--- a/paddle/fluid/operators/detection/box_clip_op.cu
+++ b/paddle/fluid/operators/detection/box_clip_op.cu
@@ -44,7 +44,7 @@ static __global__ void GPUBoxClip(const T *input,
   }
 }
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class GPUBoxClipKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -74,6 +74,6 @@ class GPUBoxClipKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(box_clip,
-                        ops::GPUBoxClipKernel<phi::GPUContext, float>,
-                        ops::GPUBoxClipKernel<phi::GPUContext, double>);
+
+PD_REGISTER_STRUCT_KERNEL(
+    box_clip, GPU, ALL_LAYOUT, ops::GPUBoxClipKernel, float, double) {}
diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h
index cb067f91662ed0..c07185dec167c5 100644
--- a/paddle/fluid/operators/detection/box_clip_op.h
+++ b/paddle/fluid/operators/detection/box_clip_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class BoxClipKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
index 7de0f05e0d3ba4..8c97523559de64 100644
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
+++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cc
@@ -225,6 +225,9 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL(box_decoder_and_assign,
-                       ops::BoxDecoderAndAssignKernel<phi::CPUContext, float>,
-                       ops::BoxDecoderAndAssignKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(box_decoder_and_assign,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::BoxDecoderAndAssignKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
index daae995de0d5ea..a956a58ac75f7c 100644
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
+++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu
@@ -95,7 +95,7 @@ __global__ void AssignBoxKernel(const T* prior_box_data,
   }
 }
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -150,7 +150,10 @@ class BoxDecoderAndAssignCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    box_decoder_and_assign,
-    ops::BoxDecoderAndAssignCUDAKernel<phi::GPUContext, float>,
-    ops::BoxDecoderAndAssignCUDAKernel<phi::GPUContext, double>);
+
+PD_REGISTER_STRUCT_KERNEL(box_decoder_and_assign,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::BoxDecoderAndAssignCUDAKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
index 31f03a76ba2451..5a191ffaf44746 100644
--- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
+++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class BoxDecoderAndAssignKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
diff --git a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
index 5dc8f8f5d691c0..b42750d84b9a9b 100644
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
@@ -105,7 +105,7 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_dygraph=False)
 
 
 class TestBipartiteMatchOpWithoutLoD(OpTest):
@@ -122,7 +122,7 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_dygraph=False)
 
 
 class TestBipartiteMatchOpWithoutLoDLargeScaleInput(OpTest):
@@ -139,7 +139,7 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_dygraph=False)
 
 
 class TestBipartiteMatchOpWithPerPredictionType(OpTest):
@@ -162,7 +162,7 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_dygraph=False)
 
 
 class TestBipartiteMatchOpWithEmptyLoD(OpTest):
@@ -179,7 +179,7 @@ def setUp(self):
         }
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_dygraph=False)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/unittests/test_box_clip_op.py b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
index b785a800458ddd..1f7f3e9672541a 100644
--- a/python/paddle/fluid/tests/unittests/test_box_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_clip_op.py
@@ -52,7 +52,7 @@ def batch_box_clip(input_boxes, im_info, lod):
 
 class TestBoxClipOp(OpTest):
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_dygraph=False)
 
     def setUp(self):
         self.op_type = "box_clip"