[Pten]Refactor the Elementwise_add Kernel (#37043)
* elementwise_add kernel refactor

* fix compile bugs in elementwise_add refactor

* fix compile bugs when running on NPU/XPU

* fix bugs when running unit tests

* fix bugs when running CI-Windows

* modify code as recommended

* code format adjust

* fix bugs when running CI

* fix compile bug when running on CI-Windows
YuanRisheng authored Nov 12, 2021
1 parent 6bf208c commit c131034
Showing 36 changed files with 1,997 additions and 768 deletions.
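Taken together, the diffs below follow one pattern: the fluid elementwise_add kernels stop implementing the math themselves, wrap their inputs and outputs as pten DenseTensors, and forward to the new pten kernel. Below is a minimal standalone sketch of that delegation pattern, using simplified stand-in types rather than Paddle's real classes.

```cpp
// Minimal standalone sketch of the delegation pattern used throughout this
// commit: the fluid kernel no longer computes anything itself; it adapts its
// tensors and forwards to the new pten kernel. All types below are simplified
// stand-ins, not Paddle's real classes.
#include <cstddef>
#include <vector>

namespace pten_sketch {

// Stand-in for pten::DenseTensor.
struct DenseTensor {
  std::vector<float> data;
};

// Stand-in for the device-agnostic pten kernel (pten::ElementwiseAdd in the
// diff); only the same-dims case is shown.
void ElementwiseAdd(const DenseTensor& x, const DenseTensor& y,
                    DenseTensor* out) {
  out->data.resize(x.data.size());
  for (std::size_t i = 0; i < x.data.size(); ++i) {
    out->data[i] = x.data[i] + y.data[i];
  }
}

}  // namespace pten_sketch

namespace fluid_sketch {

// Stand-in for framework::Tensor.
struct Tensor {
  std::vector<float> data;
};

// Plays the role of paddle::experimental::MakePtenDenseTensor (the real one
// shares storage; this sketch simply copies for brevity).
pten_sketch::DenseTensor MakePtenDenseTensor(const Tensor& t) {
  return pten_sketch::DenseTensor{t.data};
}

// The refactored Compute() body in miniature: wrap, forward, done.
void ElementwiseAddCompute(const Tensor& x, const Tensor& y, Tensor* z) {
  pten_sketch::DenseTensor pt_x = MakePtenDenseTensor(x);
  pten_sketch::DenseTensor pt_y = MakePtenDenseTensor(y);
  pten_sketch::DenseTensor pt_z;
  pten_sketch::ElementwiseAdd(pt_x, pt_y, &pt_z);
  z->data = pt_z.data;
}

}  // namespace fluid_sketch
```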
22 changes: 6 additions & 16 deletions paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -12,35 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/pten_utils.h"
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/float16.h"

// only can include the headers in paddle/top/api dirs
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/math.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

namespace paddle {
namespace operators {

template <typename T>
class ElementwiseAddKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
std::vector<const framework::Tensor*> ins;
std::vector<framework::Tensor*> outs;
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
cuda_ctx, ins, &outs, axis, AddFunctor<T>());
}
};

template <typename T>
static __global__ void SimpleElemwiseAddGradCUDAKernel(
const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) {
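The Compute() body above packs the inputs and output into vectors and hands a small functor (AddFunctor<T>) to a generic launcher. The following host-side sketch illustrates that functor-driven interface with hypothetical names; it is not the real LaunchElementwiseCudaKernel.

```cpp
// Host-side sketch of the functor-driven binary elementwise interface used in
// the Compute() body above. AddFunctor and LaunchBinaryElementwise are
// illustrative stand-ins, not the real CUDA launcher.
#include <cstddef>
#include <vector>

template <typename T>
struct AddFunctor {
  T operator()(const T& a, const T& b) const { return a + b; }
};

// Applies a binary functor element by element over two equally sized inputs.
template <typename T, typename Functor>
void LaunchBinaryElementwise(const std::vector<T>& x, const std::vector<T>& y,
                             std::vector<T>* out, Functor func) {
  out->resize(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    (*out)[i] = func(x[i], y[i]);
  }
}

// Usage mirroring "pack the tensors, then launch with AddFunctor<T>()":
//   std::vector<float> x{1, 2}, y{3, 4}, z;
//   LaunchBinaryElementwise(x, y, &z, AddFunctor<float>{});
```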
21 changes: 15 additions & 6 deletions paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -20,6 +20,13 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"

#include "paddle/fluid/framework/pten_utils.h"

// only can include the headers in paddle/pten/include dirs
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/include/core.h"
#include "paddle/pten/include/math.h"

namespace paddle {
namespace operators {

@@ -55,12 +62,14 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
auto *y = ctx.Input<framework::LoDTensor>("Y");
auto *z = ctx.Output<framework::LoDTensor>("Out");
z->mutable_data<T>(ctx.GetPlace());
if (x->dims() == y->dims()) {
SameDimsElemwiseAdd<DeviceContext, T> LaunchElementwiseCpuKernel;
LaunchElementwiseCpuKernel(ctx, x, y, z);
} else {
LaunchBroadcastElementwiseCpuKernel<DeviceContext, T>(ctx, x, y, z);
}

auto &dev_ctx = ctx.device_context<DeviceContext>();
int axis = ctx.Attr<int>("axis");
auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
pten::ElementwiseAdd<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
pt_z.get());
}
};

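The refactored CPU Compute() above forwards the "axis" attribute unchanged to pten::ElementwiseAdd. As a reminder of what that attribute encodes — y's dimensions are aligned against x's starting at axis, with -1 meaning "align to the trailing dimensions" — here is a small standalone check; it is only an illustration, not Paddle's shape-inference code.

```cpp
// Illustrative check of elementwise broadcast alignment: y's dims are matched
// against x's dims starting at `axis`, and axis == -1 means "align to the
// trailing dimensions". Not Paddle's real shape inference.
#include <cstddef>
#include <vector>

bool BroadcastShapesMatch(const std::vector<int>& x_dims,
                          const std::vector<int>& y_dims, int axis) {
  if (axis == -1) {
    axis = static_cast<int>(x_dims.size()) - static_cast<int>(y_dims.size());
  }
  if (axis < 0 ||
      static_cast<std::size_t>(axis) + y_dims.size() > x_dims.size()) {
    return false;
  }
  for (std::size_t i = 0; i < y_dims.size(); ++i) {
    // Each y dim must equal the aligned x dim or be 1 (broadcastable).
    if (y_dims[i] != x_dims[axis + i] && y_dims[i] != 1) {
      return false;
    }
  }
  return true;
}

// e.g. BroadcastShapesMatch({2, 3, 4, 5}, {3, 4}, 1) == true
```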
11 changes: 11 additions & 0 deletions paddle/fluid/operators/elementwise/elementwise_op.h
@@ -139,6 +139,17 @@ class ElementwiseOp : public framework::OperatorWithKernel {
tensor.place(), tensor.layout());
}
}

framework::KernelSignature GetExpectedPtenKernelArgs(
const framework::ExecutionContext &ctx) const override {
if (Type() == "elementwise_add") {
if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
return framework::KernelSignature("elementwise_add", {"X", "Y"},
{"axis"}, {"Out"});
}
}
return framework::KernelSignature("None", {"X"}, {}, {"Out"});
}
};

class ElementwiseOpInferVarType
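The new GetExpectedPtenKernelArgs override maps the fluid op onto a pten kernel by name plus ordered input, attribute, and output keys. The record below is a simplified, hypothetical stand-in for framework::KernelSignature, shown only to make that mapping explicit.

```cpp
// Simplified, illustrative stand-in for framework::KernelSignature: a kernel
// name plus the ordered input / attribute / output keys the op forwards.
#include <string>
#include <vector>

struct KernelSignatureSketch {
  std::string name;                  // pten kernel name, e.g. "elementwise_add"
  std::vector<std::string> inputs;   // op input names, e.g. {"X", "Y"}
  std::vector<std::string> attrs;    // forwarded attributes, e.g. {"axis"}
  std::vector<std::string> outputs;  // op output names, e.g. {"Out"}
};

// The override in the diff effectively builds:
//   {"elementwise_add", {"X", "Y"}, {"axis"}, {"Out"}}
```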
239 changes: 46 additions & 193 deletions paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
@@ -162,215 +162,68 @@ struct DimensionsTransform {
}
};

template <typename T, int VecSize, int Rank, bool IsBoundary = false>
__device__ __forceinline__ void LoadData(
T *dst, const T *__restrict__ src, uint32_t block_offset,
const kps::details::BroadcastConfig<Rank> &config, int numel, int num,
bool need_broadcast) {
// numel : whole num of output
// num: how many data will be deal with in this time
if (need_broadcast) {
kps::ReadDataBc<T, VecSize, 1, 1, Rank, IsBoundary>(dst, src, block_offset,
config, numel);
} else {
kps::ReadData<T, VecSize, 1, 1, IsBoundary>(dst, src + block_offset, num);
}
}

template <typename InT, typename OutT, typename Functor, int Arity, int VecSize,
int Rank, bool IsBoundary = false>
__device__ void DealSegment(
const framework::Array<const InT *__restrict__, Arity> &ins, OutT *out,
const framework::Array<bool, Arity> &use_broadcast, uint32_t numel,
const framework::Array<kps::details::BroadcastConfig<Rank>, Arity> &configs,
int num, Functor func) {
InT args[Arity][VecSize];
OutT result[VecSize];

int block_offset = blockIdx.x * blockDim.x * VecSize;

#pragma unroll
for (int i = 0; i < Arity; i++) {
kps::Init<InT, VecSize>(args[i], static_cast<InT>(1.0f));
LoadData<InT, VecSize, Rank, IsBoundary>(args[i], ins[i], block_offset,
configs[i], numel, num,
use_broadcast[i]);
}

const bool kCallElementwiseAny =
platform::FunctionTraits<Functor>::has_pointer_args;
ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, Arity,
kCallElementwiseAny>()(func, args, result);
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out + block_offset, result,
num);
}

template <typename InT, typename OutT, typename Functor, int Arity, int VecSize,
int Rank>
__global__ void BroadcastKernel(
framework::Array<const InT *__restrict__, Arity> ins, OutT *out,
framework::Array<bool, Arity> use_broadcast, uint32_t numel,
framework::Array<kps::details::BroadcastConfig<Rank>, Arity> configs,
int main_tid, int tail_tid, Functor func) {
int block_offset = blockIdx.x * blockDim.x * VecSize;
// data offset of this block
if (blockIdx.x < main_tid) {
int num = blockDim.x * VecSize; // blockIdx.x < main_tid
DealSegment<InT, OutT, Functor, Arity, VecSize, Rank, false>(
ins, out, use_broadcast, numel, configs, num, func);
} else { // reminder
int num = tail_tid;
DealSegment<InT, OutT, Functor, Arity, VecSize, Rank, true>(
ins, out, use_broadcast, numel, configs, num, func);
}
}

template <typename InT, typename OutT, typename Functor, int Arity, int VecSize,
int Rank>
void LaunchKernel(const platform::CUDADeviceContext &ctx,
const std::vector<const framework::Tensor *> &ins,
framework::Tensor *out, Functor func,
DimensionsTransform merge_dims) {
int numel = out->numel();
const int threads = 256;
int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;

int main_tid = numel / (VecSize * threads);
int tail_tid = numel % (VecSize * threads);
auto stream = ctx.stream();
OutT *out_data = out->data<OutT>();

framework::Array<kps::details::BroadcastConfig<Rank>, Arity> configs;
framework::Array<bool, Arity> use_broadcast;
framework::Array<const InT *__restrict__, Arity> ins_data;

for (int i = 0; i < Arity; i++) {
use_broadcast[i] = (ins[i]->numel() != numel);
ins_data[i] = ins[i]->data<InT>();
if (use_broadcast[i]) {
// get the broadcast config,
// if data shape is[m, n], then you should set data_dim = {n, m}
// eg: out's shape [3, 45, 1]. then out_dims = {1, 45, 3}
configs[i] = kps::details::BroadcastConfig<Rank>(
merge_dims.out_dims, merge_dims.in_dims[i], merge_dims.dim_size);
}
}

BroadcastKernel<InT, OutT, Functor, Arity, VecSize,
Rank><<<blocks, threads, 0, stream>>>(
ins_data, out_data, use_broadcast, numel, configs, main_tid, tail_tid,
func);
}

template <typename InT, typename OutT, typename Functor, int Arity, int VecSize>
void LaunchBroadcastKernelForDifferentVecSize(
const platform::CUDADeviceContext &ctx,
const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
int axis, Functor func) {
const auto merge_dims = DimensionsTransform(ins, out->dims(), axis);

#define CALL_BROADCAST_FOR_DIM_SIZE(rank) \
case rank: { \
LaunchKernel<InT, OutT, Functor, Arity, VecSize, rank>(ctx, ins, out, \
func, merge_dims); \
} break;

switch (merge_dims.dim_size) {
CALL_BROADCAST_FOR_DIM_SIZE(1);
CALL_BROADCAST_FOR_DIM_SIZE(2);
CALL_BROADCAST_FOR_DIM_SIZE(3);
CALL_BROADCAST_FOR_DIM_SIZE(4);
CALL_BROADCAST_FOR_DIM_SIZE(5);
CALL_BROADCAST_FOR_DIM_SIZE(6);
CALL_BROADCAST_FOR_DIM_SIZE(7);
CALL_BROADCAST_FOR_DIM_SIZE(8);
default: {
PADDLE_THROW(platform::errors::InvalidArgument(
"The maximum dimension of input tensor is expected to be less than "
"%d, but recieved %d.\n",
merge_dims.dim_size, framework::DDim::kMaxRank));
}
}
#undef CALL_BROADCAST_FOR_DIM_SIZE
}

template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
void LaunchBroadcastElementwiseCudaKernel(
const platform::CUDADeviceContext &ctx,
const std::vector<const framework::Tensor *> &ins,
std::vector<framework::Tensor *> *outs, int axis, Functor func) {
using Traits = platform::FunctionTraits<Functor>;
const int kArity =
Traits::has_pointer_args ? static_cast<int>(ET) : Traits::arity;
PADDLE_ENFORCE_EQ(ins.size(), kArity,
platform::errors::InvalidArgument(
"The number of inputs is expected to be equal to the "
"arity of functor. But recieved: the number of inputs "
"is %d, the arity of functor is %d.",
ins.size(), kArity));
PADDLE_ENFORCE_EQ(kArity, 2,
platform::errors::InvalidArgument(
"Currently only broadcast of binary is supported and "
"verified, but received %d.",
kArity));

int in_vec_size = 4;
framework::Tensor *out = (*outs)[0];
for (auto *in : ins) {
auto temp_size = platform::GetVectorizedSize<InT>(in->data<InT>());
in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size)
: in_vec_size;
std::vector<const pten::DenseTensor *> pt_inputs;
std::vector<pten::DenseTensor *> pt_outputs;
// TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary
// DenseTensor obj
// generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp
// can be deleted
// when DenseTensor support copy constructor.
std::vector<std::unique_ptr<pten::DenseTensor>> pt_inputs_tmp;
std::vector<std::unique_ptr<pten::DenseTensor>> pt_outputs_tmp;
for (auto in : ins) {
pt_inputs_tmp.emplace_back(
std::move(paddle::experimental::MakePtenDenseTensor(*in)));
}
int out_vec_size = platform::GetVectorizedSize<OutT>(out->data<OutT>());
int vec_size = std::min(out_vec_size, in_vec_size);

switch (vec_size) {
case 4: {
LaunchBroadcastKernelForDifferentVecSize<InT, OutT, Functor, kArity, 4>(
ctx, ins, out, axis, func);
break;
}
case 2: {
LaunchBroadcastKernelForDifferentVecSize<InT, OutT, Functor, kArity, 2>(
ctx, ins, out, axis, func);
break;
}
case 1: {
LaunchBroadcastKernelForDifferentVecSize<InT, OutT, Functor, kArity, 1>(
ctx, ins, out, axis, func);
break;
}
default: {
PADDLE_THROW(platform::errors::Unimplemented(
"Unsupported vectorized size: %d !", vec_size));
break;
}
for (auto out : *outs) {
pt_outputs_tmp.emplace_back(
std::move(paddle::experimental::MakePtenDenseTensor(*out)));
}
for (int i = 0; i < pt_inputs_tmp.size(); i++) {
pt_inputs.push_back(pt_inputs_tmp[i].get());
}
for (int i = 0; i < pt_outputs_tmp.size(); i++) {
pt_outputs.push_back(pt_outputs_tmp[i].get());
}
pten::LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>(
ctx, pt_inputs, &pt_outputs, axis, func);
}

template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
void LaunchElementwiseCudaKernel(
const platform::CUDADeviceContext &cuda_ctx,
const std::vector<const framework::Tensor *> &ins,
std::vector<framework::Tensor *> *outs, int axis, Functor func) {
std::vector<int> dims_size;
bool no_broadcast_flag = true;
for (auto *in : ins) {
no_broadcast_flag = ins[0]->dims() == in->dims();
dims_size.emplace_back(in->dims().size());
std::vector<const pten::DenseTensor *> pt_inputs;
std::vector<pten::DenseTensor *> pt_outputs;
// TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary
// DenseTensor obj
// generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp
// can be deleted
// when DenseTensor support copy constructor.
std::vector<std::unique_ptr<pten::DenseTensor>> pt_inputs_tmp;
std::vector<std::unique_ptr<pten::DenseTensor>> pt_outputs_tmp;
for (auto in : ins) {
pt_inputs_tmp.emplace_back(
std::move(paddle::experimental::MakePtenDenseTensor(*in)));
}

if (no_broadcast_flag) {
LaunchSameDimsElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, ins, outs,
func);
} else {
axis = axis == -1
? *std::max_element(dims_size.begin(), dims_size.end()) -
*std::min_element(dims_size.begin(), dims_size.end())
: axis;
LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, ins, outs,
axis, func);
for (auto out : *outs) {
pt_outputs_tmp.emplace_back(
std::move(paddle::experimental::MakePtenDenseTensor(*out)));
}
for (int i = 0; i < pt_inputs_tmp.size(); i++) {
pt_inputs.push_back(pt_inputs_tmp[i].get());
}
for (int i = 0; i < pt_outputs_tmp.size(); i++) {
pt_outputs.push_back(pt_outputs_tmp[i].get());
}
pten::LaunchElementwiseCudaKernel<ET, InT, OutT>(cuda_ctx, pt_inputs,
&pt_outputs, axis, func);
}

} // namespace operators
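Both wrappers in this header now follow the same adapter shape described by the TODO comments: build owning temporaries (the *_tmp vectors), collect raw pointers from them, and forward to the pten launcher. The sketch below shows that ownership pattern in isolation, with hypothetical stand-in types rather than Paddle's.

```cpp
// Standalone sketch of the "owning temporaries plus raw-pointer view" pattern
// used by both wrappers above. DenseTensorSketch, TensorSketch, and
// MakeDenseTensorSketch are hypothetical stand-ins, not Paddle types.
#include <memory>
#include <vector>

struct DenseTensorSketch {};  // stand-in for pten::DenseTensor
struct TensorSketch {};       // stand-in for framework::Tensor

// Stand-in for paddle::experimental::MakePtenDenseTensor.
std::unique_ptr<DenseTensorSketch> MakeDenseTensorSketch(
    const TensorSketch& /*t*/) {
  return std::make_unique<DenseTensorSketch>();
}

void ForwardToPtenSketch(const std::vector<const TensorSketch*>& ins) {
  // The owners must outlive the raw-pointer view handed to the pten kernel,
  // which is exactly why the diff keeps pt_inputs_tmp alongside pt_inputs.
  std::vector<std::unique_ptr<DenseTensorSketch>> owners;
  std::vector<const DenseTensorSketch*> views;
  for (const TensorSketch* in : ins) {
    owners.emplace_back(MakeDenseTensorSketch(*in));
    views.push_back(owners.back().get());
  }
  // ... hand `views` (and a similarly built output view) to the launcher ...
}
```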
