Skip to content

Commit

Permalink
register fluid kernels to phi [part 2] (#52044)
Browse files Browse the repository at this point in the history
* update bipartite_match

* update

* fix bug

* fix test

* fix bug

* fix Kunlun-KP-Build

* Revert "fix Kunlun-KP-Build"

This reverts commit ceab63c.

* update
  • Loading branch information
huangjiyi authored Mar 31, 2023
1 parent ffff133 commit d05b73e
Show file tree
Hide file tree
Showing 23 changed files with 190 additions and 111 deletions.
16 changes: 9 additions & 7 deletions paddle/fluid/operators/bpr_loss_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -174,17 +174,19 @@ class BprLossGradMaker : public framework::SingleGradOpMaker<T> {
} // namespace paddle

namespace ops = paddle::operators;
using CPUCtx = phi::CPUContext;

REGISTER_OPERATOR(bpr_loss,
ops::BprLossOp,
ops::BprLossOpMaker,
ops::BprLossGradMaker<paddle::framework::OpDesc>,
ops::BprLossGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp);
REGISTER_OP_CPU_KERNEL(bpr_loss,
ops::BprLossOpKernel<CPUCtx, float>,
ops::BprLossOpKernel<CPUCtx, double>);
REGISTER_OP_CPU_KERNEL(bpr_loss_grad,
ops::BprLossGradientOpKernel<CPUCtx, float>,
ops::BprLossGradientOpKernel<CPUCtx, double>);

PD_REGISTER_STRUCT_KERNEL(
bpr_loss, CPU, ALL_LAYOUT, ops::BprLossOpKernel, float, double) {}
PD_REGISTER_STRUCT_KERNEL(bpr_loss_grad,
CPU,
ALL_LAYOUT,
ops::BprLossGradientOpKernel,
float,
double) {}
4 changes: 2 additions & 2 deletions paddle/fluid/operators/bpr_loss_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ struct TolerableValue {
}
};

template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class BprLossOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
Expand Down Expand Up @@ -83,7 +83,7 @@ class BprLossOpKernel : public framework::OpKernel<T> {
}
};

template <typename DeviceContext, typename T>
template <typename T, typename DeviceContext>
class BprLossGradientOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
Expand Down
21 changes: 12 additions & 9 deletions paddle/fluid/operators/collective/c_allgather_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,15 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allgather,
ops::CAllGatherOp,
ops::CAllGatherOpMaker);

REGISTER_OP_CPU_KERNEL(c_allgather,
ops::CAllGatherOpCPUKernel<float>,
ops::CAllGatherOpCPUKernel<double>,
ops::CAllGatherOpCPUKernel<int>,
ops::CAllGatherOpCPUKernel<int64_t>,
ops::CAllGatherOpCPUKernel<uint8_t>,
ops::CAllGatherOpCPUKernel<int8_t>,
ops::CAllGatherOpCPUKernel<bool>,
ops::CAllGatherOpCPUKernel<plat::float16>);
PD_REGISTER_STRUCT_KERNEL(c_allgather,
CPU,
ALL_LAYOUT,
ops::CAllGatherOpCPUKernel,
float,
double,
int,
int8_t,
int64_t,
uint8_t,
bool,
plat::float16) {}
26 changes: 15 additions & 11 deletions paddle/fluid/operators/collective/c_allgather_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ limitations under the License. */
namespace paddle {
namespace operators {

template <typename T>
template <typename T, typename DeviceContext>
class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
Expand Down Expand Up @@ -93,15 +93,19 @@ class CAllGatherOpCUDAKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Registers the c_allgather GPU kernel (CAllGatherOpCUDAKernel) for the listed
// element types; plat::bfloat16 is only included when NCCL_VERSION_CODE >= 21000.
// NOTE(review): the #if/#endif below appears inside the parenthesized argument
// list of the registration macro. Assuming the registration name is a
// function-like macro, the C++ standard leaves preprocessing directives inside
// macro arguments undefined ([cpp.replace]); common compilers accept it, but
// consider hoisting the condition outside the invocation -- TODO confirm.
REGISTER_OP_CUDA_KERNEL(c_allgather,
ops::CAllGatherOpCUDAKernel<float>,
ops::CAllGatherOpCUDAKernel<double>,
PD_REGISTER_STRUCT_KERNEL(c_allgather,
GPU,
ALL_LAYOUT,
ops::CAllGatherOpCUDAKernel,
float,
double,
#if NCCL_VERSION_CODE >= 21000
ops::CAllGatherOpCUDAKernel<plat::bfloat16>,
plat::bfloat16,
#endif
ops::CAllGatherOpCUDAKernel<int>,
ops::CAllGatherOpCUDAKernel<uint8_t>,
ops::CAllGatherOpCUDAKernel<int8_t>,
ops::CAllGatherOpCUDAKernel<int64_t>,
ops::CAllGatherOpCUDAKernel<bool>,
ops::CAllGatherOpCUDAKernel<plat::float16>);
int,
uint8_t,
int8_t,
int64_t,
bool,
plat::float16) {
}
2 changes: 1 addition & 1 deletion paddle/fluid/operators/collective/c_allgather_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ limitations under the License. */
namespace paddle {
namespace operators {

template <typename T>
template <typename T, typename DeviceContext>
class CAllGatherOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
Expand Down
18 changes: 11 additions & 7 deletions paddle/fluid/operators/collective/c_allreduce_max_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class CAllReduceMaxOpMaker : public CAllReduceOpMaker {

DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"});

DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceMax, kRedMax)

} // namespace operators
} // namespace paddle

Expand All @@ -45,10 +47,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max,
ops::CAllReduceOp,
ops::CAllReduceMaxOpMaker,
ops::AllreduceMaxInplaceInferer)

REGISTER_OP_CPU_KERNEL(c_allreduce_max,
ops::CAllReduceOpCPUKernel<ops::kRedMax, float>,
ops::CAllReduceOpCPUKernel<ops::kRedMax, double>,
ops::CAllReduceOpCPUKernel<ops::kRedMax, int>,
ops::CAllReduceOpCPUKernel<ops::kRedMax, int64_t>,
ops::CAllReduceOpCPUKernel<ops::kRedMax, plat::float16>);
PD_REGISTER_STRUCT_KERNEL(c_allreduce_max,
CPU,
ALL_LAYOUT,
ops::CAllReduceMaxCPUKernel,
float,
double,
int,
int64_t,
plat::float16) {}
22 changes: 15 additions & 7 deletions paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,21 @@ limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace paddle {
namespace operators {
DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceMax, kRedMax)
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_CUDA_KERNEL(
c_allreduce_max,
ops::CAllReduceOpCUDAKernel<ops::kRedMax, float>,
ops::CAllReduceOpCUDAKernel<ops::kRedMax, double>,
ops::CAllReduceOpCUDAKernel<ops::kRedMax, int>,
ops::CAllReduceOpCUDAKernel<ops::kRedMax, int64_t>,
ops::CAllReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
PD_REGISTER_STRUCT_KERNEL(c_allreduce_max,
GPU,
ALL_LAYOUT,
ops::CAllReduceMaxCUDAKernel,
float,
double,
int,
int64_t,
plat::float16) {}
17 changes: 11 additions & 6 deletions paddle/fluid/operators/collective/c_allreduce_min_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class CAllReduceMinOpMaker : public CAllReduceOpMaker {

DECLARE_INPLACE_OP_INFERER(AllreduceMinInplaceInferer, {"X", "Out"});

DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceMin, kRedMin)

} // namespace operators
} // namespace paddle

Expand All @@ -46,9 +48,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min,
ops::CAllReduceMinOpMaker,
ops::AllreduceMinInplaceInferer)

REGISTER_OP_CPU_KERNEL(c_allreduce_min,
ops::CAllReduceOpCPUKernel<ops::kRedMin, float>,
ops::CAllReduceOpCPUKernel<ops::kRedMin, double>,
ops::CAllReduceOpCPUKernel<ops::kRedMin, int>,
ops::CAllReduceOpCPUKernel<ops::kRedMin, int64_t>,
ops::CAllReduceOpCPUKernel<ops::kRedMin, plat::float16>);
PD_REGISTER_STRUCT_KERNEL(c_allreduce_min,
CPU,
ALL_LAYOUT,
ops::CAllReduceMinCPUKernel,
float,
double,
int,
int64_t,
plat::float16) {}
22 changes: 15 additions & 7 deletions paddle/fluid/operators/collective/c_allreduce_min_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,21 @@ limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace paddle {
namespace operators {
DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceMin, kRedMin)
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_CUDA_KERNEL(
c_allreduce_min,
ops::CAllReduceOpCUDAKernel<ops::kRedMin, float>,
ops::CAllReduceOpCUDAKernel<ops::kRedMin, double>,
ops::CAllReduceOpCUDAKernel<ops::kRedMin, int>,
ops::CAllReduceOpCUDAKernel<ops::kRedMin, int64_t>,
ops::CAllReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
PD_REGISTER_STRUCT_KERNEL(c_allreduce_min,
GPU,
ALL_LAYOUT,
ops::CAllReduceMinCUDAKernel,
float,
double,
int,
int64_t,
plat::float16) {}
8 changes: 8 additions & 0 deletions paddle/fluid/operators/collective/c_allreduce_op.h
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,10 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
}
};

// Declares a class template named <op_name>CPUKernel with template parameters
// <T, DeviceContext> that derives from CAllReduceOpCPUKernel<red_type, T> and
// adds no members of its own. DeviceContext is not used by the generated class.
// red_type and CAllReduceOpCPUKernel are referenced unqualified, so expand this
// where both are visible (the uses in this change are inside
// namespace paddle::operators).
// NOTE(review): presumably this wrapper exists so each reduce type gets a
// kernel with the <T, DeviceContext> parameter order handed to
// PD_REGISTER_STRUCT_KERNEL -- confirm against that macro's definition.
#define DEFINE_C_ALLREDUCE_CPU_KERNEL(op_name, red_type) \
template <typename T, typename DeviceContext> \
class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};

#if defined(PADDLE_WITH_ASCEND_CL)
// return true if found_nan or return false;
inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
Expand Down Expand Up @@ -527,6 +531,10 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
}
};

// Declares a class template named <op_name>CUDAKernel with template parameters
// <T, DeviceContext> that derives from CAllReduceOpCUDAKernel<red_type, T> and
// adds no members of its own. DeviceContext is not used by the generated class.
// red_type and CAllReduceOpCUDAKernel are referenced unqualified, so expand
// this where both are visible (the uses in this change are inside
// namespace paddle::operators).
// NOTE(review): presumably this wrapper exists so each reduce type gets a
// kernel with the <T, DeviceContext> parameter order handed to
// PD_REGISTER_STRUCT_KERNEL -- confirm against that macro's definition.
#define DEFINE_C_ALLREDUCE_CUDA_KERNEL(op_name, red_type) \
template <typename T, typename DeviceContext> \
class op_name##CUDAKernel : public CAllReduceOpCUDAKernel<red_type, T> {};

template <ReduceType red_type, typename T>
class CAllReduceOpMLUKernel : public framework::OpKernel<T> {
public:
Expand Down
17 changes: 11 additions & 6 deletions paddle/fluid/operators/collective/c_allreduce_prod_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ class CAllReduceProdOpMaker : public CAllReduceOpMaker {

DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"});

DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceProd, kRedProd)

} // namespace operators
} // namespace paddle

Expand All @@ -46,9 +48,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod,
ops::CAllReduceProdOpMaker,
ops::AllreduceProdInplaceInferer)

REGISTER_OP_CPU_KERNEL(c_allreduce_prod,
ops::CAllReduceOpCPUKernel<ops::kRedProd, float>,
ops::CAllReduceOpCPUKernel<ops::kRedProd, double>,
ops::CAllReduceOpCPUKernel<ops::kRedProd, int>,
ops::CAllReduceOpCPUKernel<ops::kRedProd, int64_t>,
ops::CAllReduceOpCPUKernel<ops::kRedProd, plat::float16>)
PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod,
CPU,
ALL_LAYOUT,
ops::CAllReduceProdCPUKernel,
float,
double,
int,
int64_t,
plat::float16) {}
22 changes: 15 additions & 7 deletions paddle/fluid/operators/collective/c_allreduce_prod_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,13 +14,21 @@ limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace paddle {
namespace operators {
DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceProd, kRedProd)
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_CUDA_KERNEL(
c_allreduce_prod,
ops::CAllReduceOpCUDAKernel<ops::kRedProd, float>,
ops::CAllReduceOpCUDAKernel<ops::kRedProd, double>,
ops::CAllReduceOpCUDAKernel<ops::kRedProd, int>,
ops::CAllReduceOpCUDAKernel<ops::kRedProd, int64_t>,
ops::CAllReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
PD_REGISTER_STRUCT_KERNEL(c_allreduce_prod,
GPU,
ALL_LAYOUT,
ops::CAllReduceProdCUDAKernel,
float,
double,
int,
int64_t,
plat::float16) {}
17 changes: 11 additions & 6 deletions paddle/fluid/operators/collective/c_allreduce_sum_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ class CAllReduceSumOpMaker : public CAllReduceOpMaker {

DECLARE_INPLACE_OP_INFERER(AllreduceSumInplaceInferer, {"X", "Out"});

DEFINE_C_ALLREDUCE_CPU_KERNEL(CAllReduceSum, kRedSum)

} // namespace operators
} // namespace paddle

Expand All @@ -67,9 +69,12 @@ REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_sum,
ops::CAllReduceSumOpMaker,
ops::AllreduceSumInplaceInferer)

REGISTER_OP_CPU_KERNEL(c_allreduce_sum,
ops::CAllReduceOpCPUKernel<ops::kRedSum, float>,
ops::CAllReduceOpCPUKernel<ops::kRedSum, double>,
ops::CAllReduceOpCPUKernel<ops::kRedSum, int>,
ops::CAllReduceOpCPUKernel<ops::kRedSum, int64_t>,
ops::CAllReduceOpCPUKernel<ops::kRedSum, plat::float16>)
PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum,
CPU,
ALL_LAYOUT,
ops::CAllReduceSumCPUKernel,
float,
double,
int,
int64_t,
plat::float16) {}
25 changes: 17 additions & 8 deletions paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,25 @@ limitations under the License. */

#include "paddle/fluid/operators/collective/c_allreduce_op.h"

namespace paddle {
namespace operators {
DEFINE_C_ALLREDUCE_CUDA_KERNEL(CAllReduceSum, kRedSum)
} // namespace operators
} // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Registers the c_allreduce_sum GPU kernel for the listed element types;
// plat::bfloat16 is only included when NCCL_VERSION_CODE >= 21000.
// NOTE(review): the #if/#endif below appears inside the parenthesized argument
// list of the registration macro. Assuming the registration name is a
// function-like macro, the C++ standard leaves preprocessing directives inside
// macro arguments undefined ([cpp.replace]); common compilers accept it, but
// consider hoisting the condition outside the invocation -- TODO confirm.
REGISTER_OP_CUDA_KERNEL(
c_allreduce_sum,
ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum,
GPU,
ALL_LAYOUT,
ops::CAllReduceSumCUDAKernel,
float,
#if NCCL_VERSION_CODE >= 21000
ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::bfloat16>,
plat::bfloat16,
#endif
ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
double,
int,
int64_t,
plat::float16) {
}
12 changes: 8 additions & 4 deletions paddle/fluid/operators/detection/bipartite_match_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ bool DistPairDescend(std::tuple<int, int, T> pair1,
return std::get<2>(pair1) > std::get<2>(pair2);
}

template <typename T>
template <typename T, typename DeviceContext>
class BipartiteMatchKernel : public framework::OpKernel<T> {
public:
// The match_indices must be initialized to -1 at first.
Expand Down Expand Up @@ -318,6 +318,10 @@ REGISTER_OPERATOR(
ops::BipartiteMatchOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(bipartite_match,
ops::BipartiteMatchKernel<float>,
ops::BipartiteMatchKernel<double>);

PD_REGISTER_STRUCT_KERNEL(bipartite_match,
CPU,
ALL_LAYOUT,
ops::BipartiteMatchKernel,
float,
double) {}
Loading

0 comments on commit d05b73e

Please sign in to comment.