From 36a95654b149afa2ebc124171f0af7df314cff40 Mon Sep 17 00:00:00 2001
From: YuanRisheng <yuanrisheng@baidu.com>
Date: Thu, 18 Nov 2021 15:28:26 +0800
Subject: [PATCH] [PTen]elementwise_sub kernel refactor (#37260)

* elementwise_add kernel refactor

* fix compile bugs in elementwise_add refactor

* fix compile bugs when run in npu/xpu

* fix bugs when run unit test

* fix bugs when run ci-windows

* modify code as recommended

* code format adjust

* fix bugs when run ci

* fix compile bug when run in ci-windwos

* elementwise_sub refactor

* add PD_DLL_DECL for elementwise_sub

* fix bugs when compilei
---
 .../operators/elementwise/elementwise_op.h    |  6 +++
 .../elementwise/elementwise_sub_op.cc         | 27 ----------
 .../elementwise/elementwise_sub_op.cu         | 16 ------
 .../elementwise/elementwise_sub_op.h          | 26 +++++----
 paddle/pten/api/include/math.h                |  1 +
 paddle/pten/api/lib/math.cc                   | 34 ++++++++++++
 paddle/pten/include/math.h                    | 15 ++++++
 paddle/pten/kernels/cpu/math.cc               | 33 ++++++++++++
 paddle/pten/kernels/cpu/math.h                |  7 +++
 paddle/pten/kernels/cuda/math.cu              | 26 +++++++++
 paddle/pten/kernels/cuda/math.h               |  7 +++
 .../pten/kernels/functions/blas/elementwise.h |  9 ++++
 .../kernels/functions/eigen/elementwise.h     | 12 +++++
 .../functions/general/elementwise_functor.h   | 44 +++++++++++++++
 paddle/pten/tests/api/test_elementwise_api.cc | 54 +++++++++++++++++++
 .../tests/kernels/test_elementwise_dev_api.cc | 53 ++++++++++++++++++
 16 files changed, 313 insertions(+), 57 deletions(-)
diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h
index e7a013e267d2b..0b4865b4e87f4 100644
--- a/paddle/fluid/operators/elementwise/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_op.h
@@ -148,6 +148,12 @@ class ElementwiseOp : public framework::OperatorWithKernel {
                                           {"axis"}, {"Out"});
       }
     }
+    if (Type() == "elementwise_sub") {
+      if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+        return framework::KernelSignature("elementwise_sub", {"X", "Y"},
+                                          {"axis"}, {"Out"});
+      }
+    }
     return framework::KernelSignature("None", {"X"}, {}, {"Out"});
   }
 };
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
index 84aa189b89e90..810f78ce80827 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc
@@ -41,33 +41,6 @@ struct CPUPlace;
 namespace paddle {
 namespace operators {
 
-template <typename T>
-struct SameDimsElemwiseSub<
-    platform::CPUDeviceContext, T,
-    typename std::enable_if<std::is_floating_point<T>::value>::type> {
-  void operator()(const framework::ExecutionContext &ctx,
-                  const framework::Tensor *x, const framework::Tensor *y,
-                  framework::Tensor *z) {
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
-    blas.VSUB(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
-  }
-};
-
-template <typename T>
-struct SameDimsElemwiseSub<
-    platform::CPUDeviceContext, T,
-    typename std::enable_if<!std::is_floating_point<T>::value>::type> {
-  void operator()(const framework::ExecutionContext &ctx,
-                  const framework::Tensor *x, const framework::Tensor *y,
-                  framework::Tensor *z) {
-    auto eigen_x = framework::EigenVector<T>::Flatten(*x);
-    auto eigen_y = framework::EigenVector<T>::Flatten(*y);
-    auto eigen_z = framework::EigenVector<T>::Flatten(*z);
-    auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
-                       .eigen_device();
-    eigen_z.device(place) = eigen_x - eigen_y;
-  }
-};
 class ElementwiseSubOpMaker : public ElementwiseOpMaker {
  protected:
   std::string GetName() const override { return "Sub"; }
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
index 38465df243032..00562767c97a5 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu
@@ -23,22 +23,6 @@ namespace plat = paddle::platform;
 namespace paddle {
 namespace operators {
 
-template <typename T>
-class ElementwiseSubKernel<platform::CUDADeviceContext, T>
-    : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    std::vector<const framework::Tensor*> ins;
-    std::vector<framework::Tensor*> outs;
-    const auto& cuda_ctx =
-        ctx.template device_context<platform::CUDADeviceContext>();
-
-    int axis = PackTensorsIntoVector<T>(ctx, &ins, &outs);
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
-        cuda_ctx, ins, &outs, axis, SubFunctor<T>());
-  }
-};
-
 template <typename T>
 static __global__ void SimpleElemwiseSubGradCUDAKernel(const T* dout,
                                                        int64_t size, T* dx,
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
index fa26722266a63..94c8edf24a127 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -14,10 +14,15 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/framework/pten_utils.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
 #include "paddle/fluid/operators/math/blas.h"
 
+// only can include the headers in paddle/pten/include dirs
+#include "paddle/pten/api/lib/utils/tensor_utils.h"
+#include "paddle/pten/include/core.h"
+#include "paddle/pten/include/math.h"
 namespace paddle {
 namespace operators {
 
@@ -37,13 +42,6 @@ void default_elementwise_sub(const framework::ExecutionContext& ctx,
   }
 }
 
-template <typename DeviceContext, typename T, class Enable = void>
-struct SameDimsElemwiseSub {
-  void operator()(const framework::ExecutionContext& ctx,
-                  const framework::Tensor* x, const framework::Tensor* y,
-                  framework::Tensor* z);
-};
-
 template <typename DeviceContext, typename T>
 class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
@@ -53,13 +51,13 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
     auto* z = ctx.Output<framework::LoDTensor>("Out");
     z->mutable_data<T>(ctx.GetPlace());
 
-    auto dims_equal = x->dims() == y->dims();
-    if (dims_equal) {
-      SameDimsElemwiseSub<DeviceContext, T> same_dims_sub;
-      same_dims_sub(ctx, x, y, z);
-    } else {
-      default_elementwise_sub<DeviceContext, T>(ctx, x, y, z);
-    }
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    int axis = ctx.Attr<int>("axis");
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
+    auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
+    auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
+    pten::ElementwiseSub<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
+                            pt_z.get());
   }
 };
 
diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h
index 41b4d2ffb3361..a49d6c116abc5 100644
--- a/paddle/pten/api/include/math.h
+++ b/paddle/pten/api/include/math.h
@@ -25,5 +25,6 @@ PD_DLL_DECL Tensor mean(const Tensor& x);
 
 PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y);
 
+PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y);
 }  // namespace experimental
 }  // namespace paddle
diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc
index be16e83219bc2..b86391a3e44df 100644
--- a/paddle/pten/api/lib/math.cc
+++ b/paddle/pten/api/lib/math.cc
@@ -96,6 +96,40 @@ PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y) {
   return out;
 }
 
+PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y) {
+  // 1. Get kernel signature and kernel
+  auto kernel_key_set = ParseKernelKeyByInputArgs(x);
+  auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
+  auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError(
+      "elementwise_sub", kernel_key);
+
+  // 2. Get Device Context
+  auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
+  auto kernel_context = pten::KernelContext(dev_ctx);
+
+  // 3. Auto data transform
+  auto dense_x = std::dynamic_pointer_cast<pten::DenseTensor>(x.impl());
+  kernel_context.EmplaceBackInput(dense_x);
+  auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
+  kernel_context.EmplaceBackInput(dense_y);
+  kernel_context.EmplaceBackAttr(-1);
+
+  // 4. InferShape
+  auto out_meta = ElementwiseInferShape(dense_x->meta(), dense_y->meta(), -1);
+
+  // 5. Prepare outputs
+  Tensor out;
+  const auto allocator = std::make_shared<DefaultAllocator>(
+      pten::TransToFluidPlace(kernel_key.backend()));
+  auto dense_out = std::make_shared<pten::DenseTensor>(allocator, out_meta);
+  kernel_context.EmplaceBackOutput(dense_out);
+  out.set_impl(dense_out);
+
+  // 6. Call kernel
+  kernel(&kernel_context);
+
+  return out;
+}
 }  // namespace experimental
 }  // namespace paddle
 
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
index ef7f038f10cf3..72894dc74baed 100644
--- a/paddle/pten/include/math.h
+++ b/paddle/pten/include/math.h
@@ -87,4 +87,19 @@ DenseTensor ElementwiseAdd(const ContextT& dev_ctx,
   ElementwiseAdd<T>(dev_ctx, x, y, axis, &dense_out);
   return dense_out;
 }
+
+template <typename T, typename ContextT>
+DenseTensor Subtract(const ContextT& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& y,
+                     int axis) {
+  auto out_meta = ElementwiseInferShape(x.meta(), y.meta(), axis);
+  const auto allocator =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          dev_ctx.GetPlace());
+  pten::DenseTensor dense_out(allocator, out_meta);
+  ElementwiseSub<T>(dev_ctx, x, y, axis, &dense_out);
+  return dense_out;
+}
+
 }  // namespace pten
diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc
index d63292ba1f5de..4d194bc069fc7 100644
--- a/paddle/pten/kernels/cpu/math.cc
+++ b/paddle/pten/kernels/cpu/math.cc
@@ -85,6 +85,29 @@ void ElementwiseAdd(const CPUContext& dev_ctx,
     }
   }
 }
+
+template <typename T>
+void ElementwiseSub(const CPUContext& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out) {
+  if (x.dims() == y.dims()) {
+    SameDimsElementwiseCompute<general::SameDimsSubFunctor<CPUContext, T>>()(
+        dev_ctx, x, y, out);
+  } else {
+    auto x_dims = x.dims();
+    auto y_dims = y.dims();
+    if (x_dims.size() >= y_dims.size()) {
+      ElementwiseCompute<general::SubFunctor<T>, T>(
+          dev_ctx, x, y, axis, general::SubFunctor<T>(), out);
+    } else {
+      ElementwiseCompute<general::InverseSubFunctor<T>, T>(
+          dev_ctx, x, y, axis, general::InverseSubFunctor<T>(), out);
+    }
+  }
+}
+
 }  // namespace pten
 
 // TODO(chenweihang): replace by better impl
@@ -135,3 +158,13 @@ PT_REGISTER_KERNEL("elementwise_add",
                    int64_t,
                    complex64,
                    complex128) {}
+PT_REGISTER_KERNEL("elementwise_sub",
+                   CPU,
+                   ANY,
+                   pten::ElementwiseSub,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   complex64,
+                   complex128) {}
diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h
index 6d31f48e9d6e7..2cbf14c5f8776 100644
--- a/paddle/pten/kernels/cpu/math.h
+++ b/paddle/pten/kernels/cpu/math.h
@@ -53,4 +53,11 @@ void ElementwiseAdd(const CPUContext& dev_ctx,
                     int axis,
                     DenseTensor* out);
 
+template <typename T>
+void ElementwiseSub(const CPUContext& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out);
+
 }  // namespace pten
diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu
index 6a64290d39837..9191ad59ab2b8 100644
--- a/paddle/pten/kernels/cuda/math.cu
+++ b/paddle/pten/kernels/cuda/math.cu
@@ -139,6 +139,21 @@ void ElementwiseAdd(const CUDAContext& dev_ctx,
       dev_ctx, inputs, &outputs, axis, general::AddFunctor<T>());
 }
 
+template <typename T>
+void ElementwiseSub(const CUDAContext& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out) {
+  std::vector<const DenseTensor*> inputs;
+  std::vector<DenseTensor*> outputs;
+  inputs.emplace_back(&x);
+  inputs.emplace_back(&y);
+  outputs.emplace_back(out);
+  LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+      dev_ctx, inputs, &outputs, axis, general::SubFunctor<T>());
+}
+
 }  // namespace pten
 
 // TODO(chenweihang): replace by better impl
@@ -187,3 +202,14 @@ PT_REGISTER_KERNEL("elementwise_add",
                    float16,
                    complex64,
                    complex128) {}
+PT_REGISTER_KERNEL("elementwise_sub",
+                   CUDA,
+                   ANY,
+                   pten::ElementwiseSub,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   float16,
+                   complex64,
+                   complex128) {}
diff --git a/paddle/pten/kernels/cuda/math.h b/paddle/pten/kernels/cuda/math.h
index 0227daae66986..3d66991d6fd6d 100644
--- a/paddle/pten/kernels/cuda/math.h
+++ b/paddle/pten/kernels/cuda/math.h
@@ -55,6 +55,13 @@ void ElementwiseAdd(const CUDAContext& dev_ctx,
                     int axis,
                     DenseTensor* out);
 
+template <typename T>
+void ElementwiseSub(const CUDAContext& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    int axis,
+                    DenseTensor* out);
+
 }  // namespace pten
 
 #endif
diff --git a/paddle/pten/kernels/functions/blas/elementwise.h b/paddle/pten/kernels/functions/blas/elementwise.h
index a0f80a8fa6a67..7c137e466d3af 100644
--- a/paddle/pten/kernels/functions/blas/elementwise.h
+++ b/paddle/pten/kernels/functions/blas/elementwise.h
@@ -29,5 +29,14 @@ void ElementwiseAdd(const DevCtx& dev_ctx,
   blas.VADD(x.numel(), x.data<T>(), y.data<T>(), out->mutable_data<T>());
 }
 
+template <typename DevCtx, typename T>
+void ElementwiseSub(const DevCtx& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out) {
+  auto blas = paddle::operators::math::GetBlas<DevCtx, T>(dev_ctx);
+  blas.VSUB(x.numel(), x.data<T>(), y.data<T>(), out->mutable_data<T>());
+}
+
 }  // namespace blas
 }  // namespace pten
diff --git a/paddle/pten/kernels/functions/eigen/elementwise.h b/paddle/pten/kernels/functions/eigen/elementwise.h
index 21a205622573b..e9854a2d5cda9 100644
--- a/paddle/pten/kernels/functions/eigen/elementwise.h
+++ b/paddle/pten/kernels/functions/eigen/elementwise.h
@@ -32,5 +32,17 @@ void ElementwiseAdd(const DevCtx& dev_ctx,
   eigen_z.device(place) = eigen_x + eigen_y;
 }
 
+template <typename DevCtx, typename T>
+void ElementwiseSub(const DevCtx& dev_ctx,
+                    const DenseTensor& x,
+                    const DenseTensor& y,
+                    DenseTensor* out) {
+  auto eigen_x = pten::EigenVector<T>::Flatten(x);
+  auto eigen_y = pten::EigenVector<T>::Flatten(y);
+  auto eigen_z = pten::EigenVector<T>::Flatten(*out);
+  auto& place = *dev_ctx.eigen_device();
+  eigen_z.device(place) = eigen_x - eigen_y;
+}
+
 }  // namespace eigen
 }  // namespace pten
diff --git a/paddle/pten/kernels/functions/general/elementwise_functor.h b/paddle/pten/kernels/functions/general/elementwise_functor.h
index ab9960d101321..2342b68f18827 100644
--- a/paddle/pten/kernels/functions/general/elementwise_functor.h
+++ b/paddle/pten/kernels/functions/general/elementwise_functor.h
@@ -70,5 +70,49 @@ struct InverseAddFunctor {
   inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; }
 };
 
+// Subtract
+template <typename DevCtx, typename T, class Enable = void>
+struct SameDimsSubFunctor {
+  void operator()(const DevCtx& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  DenseTensor* z);
+};
+
+template <typename DevCtx, typename T>
+struct SameDimsSubFunctor<
+    DevCtx,
+    T,
+    typename std::enable_if<std::is_floating_point<T>::value>::type> {
+  void operator()(const DevCtx& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  DenseTensor* z) {
+    blas::ElementwiseSub<DevCtx, T>(dev_ctx, x, y, z);
+  }
+};
+
+template <typename DevCtx, typename T>
+struct SameDimsSubFunctor<
+    DevCtx,
+    T,
+    typename std::enable_if<!std::is_floating_point<T>::value>::type> {
+  void operator()(const DevCtx& dev_ctx,
+                  const DenseTensor& x,
+                  const DenseTensor& y,
+                  DenseTensor* z) {
+    eigen::ElementwiseSub<DevCtx, T>(dev_ctx, x, y, z);
+  }
+};
+
+template <typename T>
+struct SubFunctor {
+  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; }
+};
+template <typename T>
+struct InverseSubFunctor {
+  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; }
+};
+
 }  // namespace general
 }  // namespace pten
diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc
index e9eda8d3b1886..8265624579d33 100644
--- a/paddle/pten/tests/api/test_elementwise_api.cc
+++ b/paddle/pten/tests/api/test_elementwise_api.cc
@@ -83,3 +83,57 @@ TEST(API, add) {
   ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f);
   ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f);
 }
+
+// TODO(chenweihang): Remove this test after the API is used in the dygraph
+TEST(API, subtract) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 10}),
+                            pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x->mutable_data<float>();
+
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({10}),
+                            pten::DataLayout::NCHW));
+  auto* dense_y_data = dense_y->mutable_data<float>();
+
+  float sub[3][10] = {0.0};
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 10; ++j) {
+      dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      sub[i][j] = (i * 10 + j) * 1.0 - j * 2.0;
+    }
+  }
+  for (size_t i = 0; i < 10; ++i) {
+    dense_y_data[i] = i * 2.0;
+  }
+  paddle::experimental::Tensor x(dense_x);
+  paddle::experimental::Tensor y(dense_y);
+
+  // 2. test API
+  auto out = paddle::experimental::subtract(x, y);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2UL);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.numel(), 30);
+  ASSERT_EQ(out.is_cpu(), true);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto expect_result = sub;
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+  auto actual_result0 = dense_out->data<float>()[0];
+  auto actual_result1 = dense_out->data<float>()[1];
+  auto actual_result2 = dense_out->data<float>()[10];
+  ASSERT_NEAR(expect_result[0][0], actual_result0, 1e-6f);
+  ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f);
+  ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f);
+}
diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
index f6b93b731865c..062021ccc601e 100644
--- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
+++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc
@@ -76,3 +76,56 @@ TEST(DEV_API, elementwise_add) {
   ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f);
   ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f);
 }
+
+TEST(DEV_API, subtract) {
+  // 1. create tensor
+  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
+      paddle::platform::CPUPlace());
+  pten::DenseTensor dense_x(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({3, 10}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_x_data = dense_x.mutable_data<float>();
+
+  pten::DenseTensor dense_y(alloc,
+                            pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                                                  framework::make_ddim({10}),
+                                                  pten::DataLayout::NCHW));
+  auto* dense_y_data = dense_y.mutable_data<float>();
+
+  float sub[3][10] = {0.0};
+  for (size_t i = 0; i < 3; ++i) {
+    for (size_t j = 0; j < 10; ++j) {
+      dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0;
+      sub[i][j] = (i * 10 + j) * 1.0 - j * 2.0;
+    }
+  }
+  for (size_t i = 0; i < 10; ++i) {
+    dense_y_data[i] = i * 2.0;
+  }
+  int axis = 1;
+  paddle::platform::DeviceContextPool& pool =
+      paddle::platform::DeviceContextPool::Instance();
+  auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
+
+  // 2. test API
+  auto dense_out = pten::Subtract<float>(
+      *(static_cast<paddle::platform::CPUDeviceContext*>(dev_ctx)),
+      dense_x,
+      dense_y,
+      axis);
+
+  // 3. check result
+  ASSERT_EQ(dense_out.dims().size(), 2);
+  ASSERT_EQ(dense_out.dims()[0], 3);
+  ASSERT_EQ(dense_out.meta().type, pten::DataType::FLOAT32);
+  ASSERT_EQ(dense_out.meta().layout, pten::DataLayout::NCHW);
+
+  auto expect_result = sub;
+  auto actual_result0 = dense_out.data<float>()[0];
+  auto actual_result1 = dense_out.data<float>()[1];
+  auto actual_result2 = dense_out.data<float>()[10];
+  ASSERT_NEAR(expect_result[0][0], actual_result0, 1e-6f);
+  ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f);
+  ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f);
+}