From 36a95654b149afa2ebc124171f0af7df314cff40 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 18 Nov 2021 15:28:26 +0800 Subject: [PATCH] [PTen]elementwise_sub kernel refactor (#37260) * elementwise_add kernel refactor * fix compile bugs in elementwise_add refactor * fix compile bugs when run in npu/xpu * fix bugs when run unit test * fix bugs when run ci-windows * modify code as recommended * code format adjust * fix bugs when run ci * fix compile bug when run in ci-windwos * elementwise_sub refactor * add PD_DLL_DECL for elementwise_sub * fix bugs when compilei --- .../operators/elementwise/elementwise_op.h | 6 +++ .../elementwise/elementwise_sub_op.cc | 27 ---------- .../elementwise/elementwise_sub_op.cu | 16 ------ .../elementwise/elementwise_sub_op.h | 26 +++++---- paddle/pten/api/include/math.h | 1 + paddle/pten/api/lib/math.cc | 34 ++++++++++++ paddle/pten/include/math.h | 15 ++++++ paddle/pten/kernels/cpu/math.cc | 33 ++++++++++++ paddle/pten/kernels/cpu/math.h | 7 +++ paddle/pten/kernels/cuda/math.cu | 26 +++++++++ paddle/pten/kernels/cuda/math.h | 7 +++ .../pten/kernels/functions/blas/elementwise.h | 9 ++++ .../kernels/functions/eigen/elementwise.h | 12 +++++ .../functions/general/elementwise_functor.h | 44 +++++++++++++++ paddle/pten/tests/api/test_elementwise_api.cc | 54 +++++++++++++++++++ .../tests/kernels/test_elementwise_dev_api.cc | 53 ++++++++++++++++++ 16 files changed, 313 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index e7a013e267d2b..0b4865b4e87f4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -148,6 +148,12 @@ class ElementwiseOp : public framework::OperatorWithKernel { {"axis"}, {"Out"}); } } + if (Type() == "elementwise_sub") { + if (ctx.InputVar("X")->IsType()) { + return framework::KernelSignature("elementwise_sub", {"X", "Y"}, + {"axis"}, {"Out"}); + } + } return framework::KernelSignature("None", {"X"}, {}, {"Out"}); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 84aa189b89e90..810f78ce80827 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -41,33 +41,6 @@ struct CPUPlace; namespace paddle { namespace operators { -template -struct SameDimsElemwiseSub< - platform::CPUDeviceContext, T, - typename std::enable_if::value>::type> { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z) { - auto blas = math::GetBlas(ctx); - blas.VSUB(x->numel(), x->data(), y->data(), z->data()); - } -}; - -template -struct SameDimsElemwiseSub< - platform::CPUDeviceContext, T, - typename std::enable_if::value>::type> { - void operator()(const framework::ExecutionContext &ctx, - const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z) { - auto eigen_x = framework::EigenVector::Flatten(*x); - auto eigen_y = framework::EigenVector::Flatten(*y); - auto eigen_z = framework::EigenVector::Flatten(*z); - auto &place = *ctx.template device_context() - .eigen_device(); - eigen_z.device(place) = eigen_x - eigen_y; - } -}; class ElementwiseSubOpMaker : public ElementwiseOpMaker { protected: std::string GetName() const override { return "Sub"; } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu index 38465df243032..00562767c97a5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cu @@ -23,22 +23,6 @@ namespace plat = paddle::platform; namespace paddle { namespace operators { -template -class ElementwiseSubKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, SubFunctor()); - } -}; - template static __global__ void SimpleElemwiseSubGradCUDAKernel(const T* dout, int64_t size, T* dx, diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index fa26722266a63..94c8edf24a127 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -14,10 +14,15 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" +// only can include the headers in paddle/pten/include dirs +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/include/core.h" +#include "paddle/pten/include/math.h" namespace paddle { namespace operators { @@ -37,13 +42,6 @@ void default_elementwise_sub(const framework::ExecutionContext& ctx, } } -template -struct SameDimsElemwiseSub { - void operator()(const framework::ExecutionContext& ctx, - const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z); -}; - template class ElementwiseSubKernel : public framework::OpKernel { public: @@ -53,13 +51,13 @@ class ElementwiseSubKernel : public framework::OpKernel { auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - auto dims_equal = x->dims() == y->dims(); - if (dims_equal) { - SameDimsElemwiseSub same_dims_sub; - same_dims_sub(ctx, x, y, z); - } else { - default_elementwise_sub(ctx, x, y, z); - } + auto& dev_ctx = ctx.device_context(); + int axis = ctx.Attr("axis"); + auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); + auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); + auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); + pten::ElementwiseSub(dev_ctx, *pt_x.get(), *pt_y.get(), axis, + pt_z.get()); } }; diff --git a/paddle/pten/api/include/math.h b/paddle/pten/api/include/math.h index 41b4d2ffb3361..a49d6c116abc5 100644 --- a/paddle/pten/api/include/math.h +++ b/paddle/pten/api/include/math.h @@ -25,5 +25,6 @@ PD_DLL_DECL Tensor mean(const Tensor& x); PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y); +PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y); } // namespace experimental } // namespace paddle diff --git a/paddle/pten/api/lib/math.cc b/paddle/pten/api/lib/math.cc index be16e83219bc2..b86391a3e44df 100644 --- a/paddle/pten/api/lib/math.cc +++ b/paddle/pten/api/lib/math.cc @@ -96,6 +96,40 @@ PD_DLL_DECL Tensor add(const Tensor& x, const Tensor& y) { return out; } +PD_DLL_DECL Tensor subtract(const Tensor& x, const Tensor& y) { + // 1. Get kernel signature and kernel + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel = pten::KernelFactory::Instance().SelectKernelOrThrowError( + "elementwise_sub", kernel_key); + + // 2. Get Device Context + auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); + auto kernel_context = pten::KernelContext(dev_ctx); + + // 3. Auto data transform + auto dense_x = std::dynamic_pointer_cast(x.impl()); + kernel_context.EmplaceBackInput(dense_x); + auto dense_y = std::dynamic_pointer_cast(y.impl()); + kernel_context.EmplaceBackInput(dense_y); + kernel_context.EmplaceBackAttr(-1); + + // 4. InferShape + auto out_meta = ElementwiseInferShape(dense_x->meta(), dense_y->meta(), -1); + + // 5. Prepare outputs + Tensor out; + const auto allocator = std::make_shared( + pten::TransToFluidPlace(kernel_key.backend())); + auto dense_out = std::make_shared(allocator, out_meta); + kernel_context.EmplaceBackOutput(dense_out); + out.set_impl(dense_out); + + // 6. Call kernel + kernel(&kernel_context); + + return out; +} } // namespace experimental } // namespace paddle diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h index ef7f038f10cf3..72894dc74baed 100644 --- a/paddle/pten/include/math.h +++ b/paddle/pten/include/math.h @@ -87,4 +87,19 @@ DenseTensor ElementwiseAdd(const ContextT& dev_ctx, ElementwiseAdd(dev_ctx, x, y, axis, &dense_out); return dense_out; } + +template +DenseTensor Subtract(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferShape(x.meta(), y.meta(), axis); + const auto allocator = + std::make_shared( + dev_ctx.GetPlace()); + pten::DenseTensor dense_out(allocator, out_meta); + ElementwiseSub(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + } // namespace pten diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc index d63292ba1f5de..4d194bc069fc7 100644 --- a/paddle/pten/kernels/cpu/math.cc +++ b/paddle/pten/kernels/cpu/math.cc @@ -85,6 +85,29 @@ void ElementwiseAdd(const CPUContext& dev_ctx, } } } + +template +void ElementwiseSub(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + if (x.dims() == y.dims()) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, general::SubFunctor(), out); + } else { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, general::InverseSubFunctor(), out); + } + } +} + } // namespace pten // TODO(chenweihang): replace by better impl @@ -135,3 +158,13 @@ PT_REGISTER_KERNEL("elementwise_add", int64_t, complex64, complex128) {} +PT_REGISTER_KERNEL("elementwise_sub", + CPU, + ANY, + pten::ElementwiseSub, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h index 6d31f48e9d6e7..2cbf14c5f8776 100644 --- a/paddle/pten/kernels/cpu/math.h +++ b/paddle/pten/kernels/cpu/math.h @@ -53,4 +53,11 @@ void ElementwiseAdd(const CPUContext& dev_ctx, int axis, DenseTensor* out); +template +void ElementwiseSub(const CPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + } // namespace pten diff --git a/paddle/pten/kernels/cuda/math.cu b/paddle/pten/kernels/cuda/math.cu index 6a64290d39837..9191ad59ab2b8 100644 --- a/paddle/pten/kernels/cuda/math.cu +++ b/paddle/pten/kernels/cuda/math.cu @@ -139,6 +139,21 @@ void ElementwiseAdd(const CUDAContext& dev_ctx, dev_ctx, inputs, &outputs, axis, general::AddFunctor()); } +template +void ElementwiseSub(const CUDAContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + std::vector inputs; + std::vector outputs; + inputs.emplace_back(&x); + inputs.emplace_back(&y); + outputs.emplace_back(out); + LaunchElementwiseCudaKernel( + dev_ctx, inputs, &outputs, axis, general::SubFunctor()); +} + } // namespace pten // TODO(chenweihang): replace by better impl @@ -187,3 +202,14 @@ PT_REGISTER_KERNEL("elementwise_add", float16, complex64, complex128) {} +PT_REGISTER_KERNEL("elementwise_sub", + CUDA, + ANY, + pten::ElementwiseSub, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cuda/math.h b/paddle/pten/kernels/cuda/math.h index 0227daae66986..3d66991d6fd6d 100644 --- a/paddle/pten/kernels/cuda/math.h +++ b/paddle/pten/kernels/cuda/math.h @@ -55,6 +55,13 @@ void ElementwiseAdd(const CUDAContext& dev_ctx, int axis, DenseTensor* out); +template +void ElementwiseSub(const CUDAContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + } // namespace pten #endif diff --git a/paddle/pten/kernels/functions/blas/elementwise.h b/paddle/pten/kernels/functions/blas/elementwise.h index a0f80a8fa6a67..7c137e466d3af 100644 --- a/paddle/pten/kernels/functions/blas/elementwise.h +++ b/paddle/pten/kernels/functions/blas/elementwise.h @@ -29,5 +29,14 @@ void ElementwiseAdd(const DevCtx& dev_ctx, blas.VADD(x.numel(), x.data(), y.data(), out->mutable_data()); } +template +void ElementwiseSub(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VSUB(x.numel(), x.data(), y.data(), out->mutable_data()); +} + } // namespace blas } // namespace pten diff --git a/paddle/pten/kernels/functions/eigen/elementwise.h b/paddle/pten/kernels/functions/eigen/elementwise.h index 21a205622573b..e9854a2d5cda9 100644 --- a/paddle/pten/kernels/functions/eigen/elementwise.h +++ b/paddle/pten/kernels/functions/eigen/elementwise.h @@ -32,5 +32,17 @@ void ElementwiseAdd(const DevCtx& dev_ctx, eigen_z.device(place) = eigen_x + eigen_y; } +template +void ElementwiseSub(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*out); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x - eigen_y; +} + } // namespace eigen } // namespace pten diff --git a/paddle/pten/kernels/functions/general/elementwise_functor.h b/paddle/pten/kernels/functions/general/elementwise_functor.h index ab9960d101321..2342b68f18827 100644 --- a/paddle/pten/kernels/functions/general/elementwise_functor.h +++ b/paddle/pten/kernels/functions/general/elementwise_functor.h @@ -70,5 +70,49 @@ struct InverseAddFunctor { inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } }; +// Subtract +template +struct SameDimsSubFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsSubFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + blas::ElementwiseSub(dev_ctx, x, y, z); + } +}; + +template +struct SameDimsSubFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + eigen::ElementwiseSub(dev_ctx, x, y, z); + } +}; + +template +struct SubFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; +template +struct InverseSubFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } +}; + } // namespace general } // namespace pten diff --git a/paddle/pten/tests/api/test_elementwise_api.cc b/paddle/pten/tests/api/test_elementwise_api.cc index e9eda8d3b1886..8265624579d33 100644 --- a/paddle/pten/tests/api/test_elementwise_api.cc +++ b/paddle/pten/tests/api/test_elementwise_api.cc @@ -83,3 +83,57 @@ TEST(API, add) { ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f); ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f); } + +// TODO(chenweihang): Remove this test after the API is used in the dygraph +TEST(API, subtract) { + // 1. create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + auto dense_x = std::make_shared( + alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x->mutable_data(); + + auto dense_y = std::make_shared( + alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({10}), + pten::DataLayout::NCHW)); + auto* dense_y_data = dense_y->mutable_data(); + + float sub[3][10] = {0.0}; + for (size_t i = 0; i < 3; ++i) { + for (size_t j = 0; j < 10; ++j) { + dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0; + sub[i][j] = (i * 10 + j) * 1.0 - j * 2.0; + } + } + for (size_t i = 0; i < 10; ++i) { + dense_y_data[i] = i * 2.0; + } + paddle::experimental::Tensor x(dense_x); + paddle::experimental::Tensor y(dense_y); + + // 2. test API + auto out = paddle::experimental::subtract(x, y); + + // 3. check result + ASSERT_EQ(out.shape().size(), 2UL); + ASSERT_EQ(out.shape()[0], 3); + ASSERT_EQ(out.numel(), 30); + ASSERT_EQ(out.is_cpu(), true); + ASSERT_EQ(out.type(), pten::DataType::FLOAT32); + ASSERT_EQ(out.layout(), pten::DataLayout::NCHW); + ASSERT_EQ(out.initialized(), true); + + auto expect_result = sub; + auto dense_out = std::dynamic_pointer_cast(out.impl()); + auto actual_result0 = dense_out->data()[0]; + auto actual_result1 = dense_out->data()[1]; + auto actual_result2 = dense_out->data()[10]; + ASSERT_NEAR(expect_result[0][0], actual_result0, 1e-6f); + ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f); + ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f); +} diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index f6b93b731865c..062021ccc601e 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -76,3 +76,56 @@ TEST(DEV_API, elementwise_add) { ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f); ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f); } + +TEST(DEV_API, subtract) { + // 1. create tensor + const auto alloc = std::make_shared( + paddle::platform::CPUPlace()); + pten::DenseTensor dense_x(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({3, 10}), + pten::DataLayout::NCHW)); + auto* dense_x_data = dense_x.mutable_data(); + + pten::DenseTensor dense_y(alloc, + pten::DenseTensorMeta(pten::DataType::FLOAT32, + framework::make_ddim({10}), + pten::DataLayout::NCHW)); + auto* dense_y_data = dense_y.mutable_data(); + + float sub[3][10] = {0.0}; + for (size_t i = 0; i < 3; ++i) { + for (size_t j = 0; j < 10; ++j) { + dense_x_data[i * 10 + j] = (i * 10 + j) * 1.0; + sub[i][j] = (i * 10 + j) * 1.0 - j * 2.0; + } + } + for (size_t i = 0; i < 10; ++i) { + dense_y_data[i] = i * 2.0; + } + int axis = 1; + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(paddle::platform::CPUPlace()); + + // 2. test API + auto dense_out = pten::Subtract( + *(static_cast(dev_ctx)), + dense_x, + dense_y, + axis); + + // 3. check result + ASSERT_EQ(dense_out.dims().size(), 2); + ASSERT_EQ(dense_out.dims()[0], 3); + ASSERT_EQ(dense_out.meta().type, pten::DataType::FLOAT32); + ASSERT_EQ(dense_out.meta().layout, pten::DataLayout::NCHW); + + auto expect_result = sub; + auto actual_result0 = dense_out.data()[0]; + auto actual_result1 = dense_out.data()[1]; + auto actual_result2 = dense_out.data()[10]; + ASSERT_NEAR(expect_result[0][0], actual_result0, 1e-6f); + ASSERT_NEAR(expect_result[0][1], actual_result1, 1e-6f); + ASSERT_NEAR(expect_result[1][0], actual_result2, 1e-6f); +}