PaddlePaddle · huangjiyi · Apr 4, 2023 · Apr 4, 2023 · Apr 6, 2023 · Apr 6, 2023
diff --git a/paddle/fluid/operators/amp/get_float_status_op.cc b/paddle/fluid/operators/amp/get_float_status_op.cc
@@ -53,7 +53,7 @@ class GetFloatStatusMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class GetFloatStatusKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -75,4 +75,5 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL(get_float_status, ops::GetFloatStatusKernel<CPU, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    get_float_status, CPU, ALL_LAYOUT, ops::GetFloatStatusKernel, float) {}
diff --git a/paddle/fluid/operators/collective/global_gather_op.cc b/paddle/fluid/operators/collective/global_gather_op.cc
@@ -111,9 +111,12 @@ REGISTER_OPERATOR(global_gather,
                   ops::GlobalGatherOpGradMaker<paddle::framework::OpDesc>,
                   ops::GlobalGatherOpGradMaker<paddle::imperative::OpBase>)
 
-REGISTER_OP_CPU_KERNEL(global_gather,
-                       ops::GlobalGatherOpCPUKernel<float>,
-                       ops::GlobalGatherOpCPUKernel<double>,
-                       ops::GlobalGatherOpCPUKernel<int>,
-                       ops::GlobalGatherOpCPUKernel<int64_t>,
-                       ops::GlobalGatherOpCPUKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(global_gather,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::GlobalGatherOpCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/global_gather_op.cu.cc b/paddle/fluid/operators/collective/global_gather_op.cu.cc
@@ -261,7 +261,7 @@ struct GlobalGatherProcessGroupFunctor<phi::GPUContext, T> {
   }
 };
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class GlobalGatherOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -283,9 +283,12 @@ class GlobalGatherOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(global_gather,
-                        ops::GlobalGatherOpCUDAKernel<float>,
-                        ops::GlobalGatherOpCUDAKernel<double>,
-                        ops::GlobalGatherOpCUDAKernel<int>,
-                        ops::GlobalGatherOpCUDAKernel<int64_t>,
-                        ops::GlobalGatherOpCUDAKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(global_gather,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::GlobalGatherOpCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/global_gather_op.h b/paddle/fluid/operators/collective/global_gather_op.h
@@ -25,7 +25,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class GlobalGatherOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {

diff --git a/paddle/fluid/operators/collective/global_scatter_op.cc b/paddle/fluid/operators/collective/global_scatter_op.cc
@@ -115,9 +115,12 @@ REGISTER_OPERATOR(global_scatter,
                   ops::GlobalScatterOpGradMaker<paddle::framework::OpDesc>,
                   ops::GlobalScatterOpGradMaker<paddle::imperative::OpBase>)
 
-REGISTER_OP_CPU_KERNEL(global_scatter,
-                       ops::GlobalScatterOpCPUKernel<float>,
-                       ops::GlobalScatterOpCPUKernel<double>,
-                       ops::GlobalScatterOpCPUKernel<int>,
-                       ops::GlobalScatterOpCPUKernel<int64_t>,
-                       ops::GlobalScatterOpCPUKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(global_scatter,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::GlobalScatterOpCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc
@@ -259,7 +259,7 @@ struct GlobalScatterProcessGroupFunctor<phi::GPUContext, T> {
   }
 };
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class GlobalScatterOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -281,9 +281,12 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel<T> {
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
 
-REGISTER_OP_CUDA_KERNEL(global_scatter,
-                        ops::GlobalScatterOpCUDAKernel<float>,
-                        ops::GlobalScatterOpCUDAKernel<double>,
-                        ops::GlobalScatterOpCUDAKernel<int>,
-                        ops::GlobalScatterOpCUDAKernel<int64_t>,
-                        ops::GlobalScatterOpCUDAKernel<plat::float16>);
+PD_REGISTER_STRUCT_KERNEL(global_scatter,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::GlobalScatterOpCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/global_scatter_op.h b/paddle/fluid/operators/collective/global_scatter_op.h
@@ -25,7 +25,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class GlobalScatterOpCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {

diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cc
@@ -76,6 +76,8 @@ class MpAllReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
 
 DECLARE_INPLACE_OP_INFERER(MpAllReduceSumInplaceInferer, {"X", "Out"});
 
+DEFINE_C_ALLREDUCE_CPU_KERNEL(MpAllReduceSum, kRedSum);
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -89,9 +91,12 @@ REGISTER_OPERATOR(mp_allreduce_sum,
                   ops::MpAllReduceSumOpMaker,
                   ops::MpAllReduceSumInplaceInferer);
 
-REGISTER_OP_CPU_KERNEL(mp_allreduce_sum,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MpAllReduceSumCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc
@@ -15,16 +15,24 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 
+namespace paddle {
+namespace operators {
+DEFINE_C_ALLREDUCE_CUDA_KERNEL(MpAllReduceSum, kRedSum)
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-
-REGISTER_OP_CUDA_KERNEL(
-    mp_allreduce_sum,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
+PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::MpAllReduceSumCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
 #if NCCL_VERSION_CODE >= 21000
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
+                          plat::float16) {
+}
diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc
@@ -78,15 +78,15 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) {
       load_combine,
       device_type,
       paddle::operators::
-          LoadCombineOpKernel<paddle::platform::CustomDeviceContext, float>,
+          LoadCombineOpKernel<float, paddle::platform::CustomDeviceContext>,
       paddle::operators::
-          LoadCombineOpKernel<paddle::platform::CustomDeviceContext, double>,
+          LoadCombineOpKernel<double, paddle::platform::CustomDeviceContext>,
       paddle::operators::
-          LoadCombineOpKernel<paddle::platform::CustomDeviceContext, int>,
+          LoadCombineOpKernel<int, paddle::platform::CustomDeviceContext>,
       paddle::operators::
-          LoadCombineOpKernel<paddle::platform::CustomDeviceContext, int8_t>,
+          LoadCombineOpKernel<int8_t, paddle::platform::CustomDeviceContext>,
       paddle::operators::
-          LoadCombineOpKernel<paddle::platform::CustomDeviceContext, int64_t>);
+          LoadCombineOpKernel<int64_t, paddle::platform::CustomDeviceContext>);
 #endif
 }
 

diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
@@ -328,7 +328,7 @@ std::vector<phi::DenseTensor> SampleMaskForOneImage(
   return res;
 }
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class GenerateMaskLabelsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -533,5 +533,9 @@ REGISTER_OPERATOR(
     ops::GenerateMaskLabelsOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(generate_mask_labels,
-                       ops::GenerateMaskLabelsKernel<float>);
+
+PD_REGISTER_STRUCT_KERNEL(generate_mask_labels,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::GenerateMaskLabelsKernel,
+                          float) {}
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -510,7 +510,7 @@ std::vector<phi::DenseTensor> SampleRoisForOneImage(
   return res;
 }
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class GenerateProposalLabelsKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
@@ -811,9 +811,12 @@ REGISTER_OPERATOR(
     ops::GenerateProposalLabelsOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(generate_proposal_labels,
-                       ops::GenerateProposalLabelsKernel<float>,
-                       ops::GenerateProposalLabelsKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(generate_proposal_labels,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::GenerateProposalLabelsKernel,
+                          float,
+                          double) {}
 
 REGISTER_OP_VERSION(generate_proposal_labels)
     .AddCheckpoint(

diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cc b/paddle/fluid/operators/detection/iou_similarity_op.cc
@@ -114,6 +114,5 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL(iou_similarity,
-                       ops::IOUSimilarityKernel<phi::CPUContext, float>,
-                       ops::IOUSimilarityKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    iou_similarity, CPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {}
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.cu b/paddle/fluid/operators/detection/iou_similarity_op.cu
@@ -15,6 +15,5 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/iou_similarity_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(iou_similarity,
-                        ops::IOUSimilarityKernel<phi::GPUContext, float>,
-                        ops::IOUSimilarityKernel<phi::GPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    iou_similarity, GPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {}
diff --git a/paddle/fluid/operators/detection/iou_similarity_op.h b/paddle/fluid/operators/detection/iou_similarity_op.h
@@ -105,7 +105,7 @@ struct IOUSimilarityFunctor {
 namespace paddle {
 namespace operators {
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class IOUSimilarityKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {

diff --git a/paddle/fluid/operators/detection/locality_aware_nms_op.cc b/paddle/fluid/operators/detection/locality_aware_nms_op.cc
@@ -160,7 +160,7 @@ void GetMaxScoreIndexWithLocalityAware(
   }
 }
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class LocalityAwareNMSKernel : public framework::OpKernel<T> {
  public:
   void LocalityAwareNMSFast(phi::DenseTensor* bbox,
@@ -520,6 +520,9 @@ REGISTER_OPERATOR(
     ops::LocalityAwareNMSOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(locality_aware_nms,
-                       ops::LocalityAwareNMSKernel<float>,
-                       ops::LocalityAwareNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(locality_aware_nms,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::LocalityAwareNMSKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc
@@ -49,7 +49,7 @@ inline MiningType GetMiningType(std::string str) {
   }
 }
 
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MineHardExamplesKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -403,6 +403,9 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
 
-REGISTER_OP_CPU_KERNEL(mine_hard_examples,
-                       ops::MineHardExamplesKernel<phi::CPUContext, float>,
-                       ops::MineHardExamplesKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(mine_hard_examples,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MineHardExamplesKernel,
+                          float,
+                          double) {}
diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc
@@ -143,7 +143,7 @@ void SliceOneClass(const platform::DeviceContext& ctx,
   }
 }
 
-template <typename T>
+template <typename T, typename DeviceContext>
 class MultiClassNMSKernel : public framework::OpKernel<T> {
  public:
   void NMSFast(const phi::DenseTensor& bbox,
@@ -629,6 +629,9 @@ class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
   }
 };
 
+template <typename T, typename DeviceContext>
+class MultiClassNMS2Kernel : public MultiClassNMSKernel<T, DeviceContext> {};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -643,18 +646,21 @@ REGISTER_OPERATOR(
     ops::MultiClassNMSOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(multiclass_nms,
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(
+    multiclass_nms, CPU, ALL_LAYOUT, ops::MultiClassNMSKernel, float, double) {}
+
 REGISTER_OPERATOR(
     multiclass_nms2,
     ops::MultiClassNMS2Op,
     ops::MultiClassNMS2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(multiclass_nms2,
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(multiclass_nms2,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MultiClassNMS2Kernel,
+                          float,
+                          double) {}
 
 REGISTER_OPERATOR(
     multiclass_nms3,