From 0f66a27b199af8e351e9543b763939c164be0123 Mon Sep 17 00:00:00 2001
From: YuanRisheng
Date: Wed, 29 Dec 2021 13:34:13 +0000
Subject: [PATCH 1/5] change 'math' to 'math_kernel'

---
 .../elementwise/elementwise_add_op.h          |   4 +-
 .../elementwise/elementwise_div_op.h          |   4 +-
 .../elementwise/elementwise_mul_op.h          |   5 +-
 .../elementwise/elementwise_sub_op.h          |   5 +-
 paddle/pten/CMakeLists.txt                    |   4 -
 paddle/pten/api/lib/kernel_declare.h          |   6 -
 paddle/pten/include/math.h                    |  64 +------
 paddle/pten/kernels/CMakeLists.txt            |   9 +-
 paddle/pten/kernels/cpu/CMakeLists.txt        |   1 -
 paddle/pten/kernels/cpu/math.cc               | 149 ---------------
 paddle/pten/kernels/cpu/math.h                |  93 ---------
 paddle/pten/kernels/cpu/math_kernel.cc        | 178 ++++++++++++++++++
 paddle/pten/kernels/gpu/CMakeLists.txt        |   5 -
 paddle/pten/kernels/gpu/math.cu               | 159 ----------------
 paddle/pten/kernels/gpu/math.h                |  90 ---------
 paddle/pten/kernels/gpu/math_kernel.cu        | 177 +++++++++++++++++
 paddle/pten/kernels/math_kernel.h             | 124 ++++++++++++
 .../tests/kernels/test_elementwise_dev_api.cc |   2 +-
 18 files changed, 500 insertions(+), 579 deletions(-)
 delete mode 100644 paddle/pten/kernels/cpu/math.cc
 delete mode 100644 paddle/pten/kernels/cpu/math.h
 create mode 100644 paddle/pten/kernels/cpu/math_kernel.cc
 delete mode 100644 paddle/pten/kernels/gpu/math.cu
 delete mode 100644 paddle/pten/kernels/gpu/math.h
 create mode 100644 paddle/pten/kernels/gpu/math_kernel.cu
 create mode 100644 paddle/pten/kernels/math_kernel.h

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h
index a4567beeb4f3d..d6d79d166d00a 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h
@@ -25,7 +25,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -68,7 +68,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Add<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::AddKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
index f3ba5050c4f53..c886644bbdd1b 100644
--- a/paddle/fluid/operators/elementwise/elementwise_div_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -28,7 +28,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -62,7 +62,7 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Divide<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::DivideKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
   }
 };
 
diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
index 8b43f82e6b6a1..3b0f072572210 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h
@@ -24,7 +24,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -129,7 +129,8 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
       auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
       auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
       auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
-      pten::Multiply<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+      pten::MultiplyKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
+                              pt_z.get());
     } else {
       PADDLE_THROW(platform::errors::InvalidArgument(
           "X's type[%s] is not supported by elementwise_op. X's type should be "
diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
index 09a33788d4133..6a51d7c2a45ad 100644
--- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h
+++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h
@@ -22,7 +22,7 @@ limitations under the License. */
 // only can include the headers in paddle/pten/include dirs
 #include "paddle/pten/api/lib/utils/tensor_utils.h"
 #include "paddle/pten/include/core.h"
-#include "paddle/pten/include/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 
 namespace paddle {
 namespace operators {
@@ -56,7 +56,8 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
     auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
     auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
     auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
-    pten::Subtract<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get());
+    pten::SubtractKernel<T>(dev_ctx, *pt_x.get(), *pt_y.get(), axis,
+                            pt_z.get());
   }
 };
 
diff --git a/paddle/pten/CMakeLists.txt b/paddle/pten/CMakeLists.txt
index 7adfca40319b1..5cf02e1b11b40 100644
--- a/paddle/pten/CMakeLists.txt
+++ b/paddle/pten/CMakeLists.txt
@@ -28,10 +28,6 @@ get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS)
 # keep this message for debug, remove it later if needless
 message(STATUS "All standard pten kernels: ${pten_kernels}")
 set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels})
-set(PTEN_DEPS ${PTEN_DEPS} math_cpu)
 set(PTEN_DEPS ${PTEN_DEPS} nary unary binary)
-if(WITH_GPU OR WITH_ROCM)
-  set(PTEN_DEPS ${PTEN_DEPS} math_gpu)
-endif()
 
 cc_library(pten SRCS all.cc DEPS ${PTEN_DEPS})
diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index 484063df478aa..4d3143ef09ccc 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -19,9 +19,3 @@ limitations under the License. */
 // TODO(chenweihang) After the kernel is split into a single file,
 // the kernel declare statement is automatically generated according to the
 // file name of the kernel, and this header file will be removed
-
-PT_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
-
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-PT_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
-#endif
diff --git a/paddle/pten/include/math.h b/paddle/pten/include/math.h
index 876834cea7806..9abfa297a9452 100644
--- a/paddle/pten/include/math.h
+++ b/paddle/pten/include/math.h
@@ -18,8 +18,7 @@ limitations under the License. */
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
 #include "paddle/pten/kernels/complex_kernel.h"
-#include "paddle/pten/kernels/cpu/math.h"
-#include "paddle/pten/kernels/gpu/math.h"
+#include "paddle/pten/kernels/math_kernel.h"
 #include "paddle/pten/kernels/scale_kernel.h"
 
 namespace pten {
@@ -46,7 +45,7 @@ DenseTensor Mean(const ContextT& dev_ctx,
           dev_ctx.GetPlace()),
       std::move(out_meta));
   bool reduce_all = false;
-  Mean<T>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
+  Mean<T, ContextT>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
   return dense_out;
 }
 
@@ -66,7 +65,8 @@ DenseTensor Sum(const ContextT& dev_ctx,
   // so use default value(false) is OK.
bool reduce_all = false; - Sum(dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); + Sum( + dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out); return dense_out; } @@ -85,62 +85,6 @@ DenseTensor Scale(const ContextT& dev_ctx, return dense_out; } -template -DenseTensor Add(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Add(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - -template -DenseTensor Subtract(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Subtract(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - -template -DenseTensor Divide(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Divide(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - -template -DenseTensor Multiply(const ContextT& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis) { - auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); - pten::DenseTensor dense_out( - pten::make_intrusive( - dev_ctx.GetPlace()), - std::move(out_meta)); - Multiply(dev_ctx, x, y, axis, &dense_out); - return dense_out; -} - template DenseTensor Conj(const ContextT& dev_ctx, const DenseTensor& x) { auto out_meta = UnchangedInferMeta(x.meta()); diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index be4f0d5b191ff..6a5f32e5bd05d 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -24,10 +24,13 @@ endif() # pten depends all pten kernel targets set_property(GLOBAL PROPERTY PTEN_KERNELS "") -set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function) +set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory convert_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas pten_transpose_gpu pten_transpose_cpu) + +set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) # auto build kernel targets by cmake -register_kernels(DEPS ${COMMON_KERNEL_DEPS}) +register_kernels(EXCLUDES math_kernel DEPS ${COMMON_KERNEL_DEPS}) +kernel_library(math_kernel DEPS ${MATH_KERNEL_DEPS}) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt index 9bf3df598e4c0..e69de29bb2d1d 100644 --- a/paddle/pten/kernels/cpu/CMakeLists.txt +++ b/paddle/pten/kernels/cpu/CMakeLists.txt @@ -1 +0,0 @@ -cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu cast_kernel) diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc deleted file mode 100644 index ee01cf65d02e5..0000000000000 --- a/paddle/pten/kernels/cpu/math.cc +++ /dev/null @@ -1,149 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/pten/kernels/cpu/math.h" - -#include "paddle/pten/api/ext/dispatch.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" -#include "paddle/pten/kernels/hybird/eigen/reduce.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" - -namespace pten { - -template -void Mean(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - pten::general::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -template -void Divide(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out) { - // allocate memory for out - out->mutable_data(); - if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( - dev_ctx, x, y, out); - } else { - auto x_dims = x.dims(); - auto y_dims = y.dims(); - if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::DivideFunctor(), out); - } else { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::InverseDivideFunctor(), out); - } - } -} - -template -void Sum(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - pten::general::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -// Create the definition of Add -DEFINE_CPU_ELEMENTWISE_OP(Add) - -// Create the definition of Subtract -DEFINE_CPU_ELEMENTWISE_OP(Subtract) - -// Create the definition of Multiply -DEFINE_CPU_ELEMENTWISE_OP(Multiply) - -} // namespace pten - -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; - -// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 -// using bfloat16 = ::paddle::platform::bfloat16; -PT_REGISTER_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) {} -PT_REGISTER_KERNEL(add, - CPU, - ALL_LAYOUT, - pten::Add, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_KERNEL(subtract, - CPU, - ALL_LAYOUT, - pten::Subtract, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_KERNEL(divide, - CPU, - ALL_LAYOUT, - pten::Divide, - float, - double, - int, - int64_t, - complex64, - complex128) {} -PT_REGISTER_KERNEL(multiply, - CPU, - ALL_LAYOUT, - pten::Multiply, - float, - double, - int, - int64_t, - bool, - complex64, - complex128) {} -PT_REGISTER_KERNEL(sum, - CPU, - ALL_LAYOUT, - pten::Sum, - bool, - float, - double, - paddle::platform::float16, - int, - int64_t, - complex64, - complex128) { - 
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} diff --git a/paddle/pten/kernels/cpu/math.h b/paddle/pten/kernels/cpu/math.h deleted file mode 100644 index 1a179218b4c4c..0000000000000 --- a/paddle/pten/kernels/cpu/math.h +++ /dev/null @@ -1,93 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/backends/cpu/cpu_context.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/core/kernel_registry.h" - -namespace pten { - -template -void Mean(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void Add(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Subtract(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Divide(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Multiply(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); -template -void Sum(const CPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -} // namespace pten - -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name(const CPUContext& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - out->mutable_data(); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute< \ - general::SameDims##name##Functor>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::name##Functor(), out); \ - } else { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::Inverse##name##Functor(), out); \ - } \ - } \ - } diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc new file mode 100644 index 0000000000000..152d945144f6c --- /dev/null +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -0,0 +1,178 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/pten/kernels/math_kernel.h" + +#include "paddle/pten/api/ext/dispatch.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/common/scalar.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/hybird/cpu/elementwise.h" +#include "paddle/pten/kernels/hybird/eigen/reduce.h" +#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" +#include "paddle/pten/kernels/hybird/general/reduce_impl.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex.h" + +namespace pten { + +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + out->mutable_data(); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute< \ + general::SameDims##name##Functor>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, general::name##Functor(), out); \ + } else { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, general::Inverse##name##Functor(), out); \ + } \ + } \ + } + +template +void Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + pten::general::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + // allocate memory for out + out->mutable_data(); + if (x.dims() == y.dims() && std::is_floating_point::value) { + SameDimsElementwiseCompute>()( + dev_ctx, x, y, out); + } else { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, general::DivideFunctor(), out); + } else { + ElementwiseCompute, T>( + dev_ctx, x, y, axis, general::InverseDivideFunctor(), out); + } + } +} + +template +void Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + pten::general::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +// Create the definition of Add +DEFINE_CPU_ELEMENTWISE_OP(Add) + +// Create the definition of Subtract +DEFINE_CPU_ELEMENTWISE_OP(Subtract) + +// Create the definition of Multiply +DEFINE_CPU_ELEMENTWISE_OP(Multiply) + +} // namespace pten + +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16 +// using bfloat16 = ::paddle::platform::bfloat16; +PT_REGISTER_CTX_KERNEL(mean, CPU, ALL_LAYOUT, pten::Mean, float, double, bool) { +} +PT_REGISTER_CTX_KERNEL(add, + CPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(subtract, + CPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(divide, + CPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(multiply, + CPU, + ALL_LAYOUT, + pten::MultiplyKernel, + 
float, + double, + int, + int64_t, + bool, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(sum, + CPU, + ALL_LAYOUT, + pten::Sum, + bool, + float, + double, + paddle::platform::float16, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt index 51c666947b2f2..e69de29bb2d1d 100644 --- a/paddle/pten/kernels/gpu/CMakeLists.txt +++ b/paddle/pten/kernels/gpu/CMakeLists.txt @@ -1,5 +0,0 @@ -if(WITH_GPU) - nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel copy_kernel) -elseif(WITH_ROCM) - hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel copy_kernel) -endif() diff --git a/paddle/pten/kernels/gpu/math.cu b/paddle/pten/kernels/gpu/math.cu deleted file mode 100644 index e02403ac426f2..0000000000000 --- a/paddle/pten/kernels/gpu/math.cu +++ /dev/null @@ -1,159 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/pten/kernels/gpu/math.h" - -#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" -#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" -#include "paddle/pten/kernels/hybird/general/reduce_impl.h" - -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/core/convert_utils.h" -#include "paddle/pten/core/kernel_registry.h" - -namespace kps = paddle::operators::kernel_primitives; - -namespace pten { - -/** - * Util Functors - */ - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) - : n_inv(static_cast(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; - -/** - * Kernels - */ - -template -void Mean(const GPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out) { - auto out_dtype = x.dtype(); - pten::Reduce( - dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); -} - -// Create the definition of Add -DEFINE_CUDA_ELEMENTWISE_OP(Add) -// Create the definition of Subtract -DEFINE_CUDA_ELEMENTWISE_OP(Subtract) -// Create the definition of Multiply -DEFINE_CUDA_ELEMENTWISE_OP(Multiply) -// Create the definition of Divide -DEFINE_CUDA_ELEMENTWISE_OP(Divide) - -template -void Sum(const GPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out) { - pten::Reduce( - dev_ctx, x, 
reduce_all, dims, keep_dim, out_dtype, out); -} - -} // namespace pten - -using float16 = paddle::platform::float16; -using complex64 = ::paddle::platform::complex; -using complex128 = ::paddle::platform::complex; - -PT_REGISTER_KERNEL( - mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool, float16) {} -PT_REGISTER_KERNEL(add, - GPU, - ALL_LAYOUT, - pten::Add, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(subtract, - GPU, - ALL_LAYOUT, - pten::Subtract, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(divide, - GPU, - ALL_LAYOUT, - pten::Divide, - float, - double, - int, - int64_t, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(multiply, - GPU, - ALL_LAYOUT, - pten::Multiply, - float, - double, - int, - int64_t, - bool, - float16, - complex64, - complex128) {} -PT_REGISTER_KERNEL(sum, - GPU, - ALL_LAYOUT, - pten::Sum, - bool, - float, - double, - float16, - int, - int64_t, - complex64, - complex128) { - kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); -} diff --git a/paddle/pten/kernels/gpu/math.h b/paddle/pten/kernels/gpu/math.h deleted file mode 100644 index c1d33a0fcdd09..0000000000000 --- a/paddle/pten/kernels/gpu/math.h +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -// CUDA and HIP use same api -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/common/scalar.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { - -template -void Mean(const GPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DenseTensor* out); - -template -void Add(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Subtract(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Divide(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Multiply(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - int axis, - DenseTensor* out); - -template -void Sum(const GPUContext& dev_ctx, - const DenseTensor& x, - const std::vector& dims, - bool keep_dim, - bool reduce_all, - DataType out_dtype, - DenseTensor* out); - -} // namespace pten - -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name(const GPUContext& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - out->mutable_data(); \ - LaunchElementwiseCudaKernel( \ - dev_ctx, inputs, &outputs, axis, general::name##Functor()); \ - } - -#endif diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu new file mode 100644 index 0000000000000..636d0f16b0d71 --- /dev/null +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -0,0 +1,177 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/pten/kernels/math_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" +#include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" +#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" +#include "paddle/pten/kernels/hybird/general/reduce_impl.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include "paddle/fluid/platform/complex.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/api/lib/utils/tensor_utils.h" +#include "paddle/pten/core/convert_utils.h" +#include "paddle/pten/core/kernel_registry.h" + +namespace kps = paddle::operators::kernel_primitives; + +namespace pten { + +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + out->mutable_data(); \ + LaunchElementwiseCudaKernel( \ + dev_ctx, inputs, &outputs, axis, general::name##Functor()); \ + } + +/** + * Util Functors + */ + +template +struct DivideFunctor { + HOSTDEVICE explicit inline DivideFunctor(int n) + : n_inv(static_cast(1.0 / n)) {} + + HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } + + private: + T n_inv; +}; + +/** + * Kernels + */ + +template +void Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out) { + auto out_dtype = x.dtype(); + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +// Create the definition of Add +DEFINE_CUDA_ELEMENTWISE_OP(Add) +// Create the definition of Subtract +DEFINE_CUDA_ELEMENTWISE_OP(Subtract) +// Create the definition of Multiply +DEFINE_CUDA_ELEMENTWISE_OP(Multiply) +// Create the definition of Divide +DEFINE_CUDA_ELEMENTWISE_OP(Divide) + +template +void Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out) { + pten::Reduce( + dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out); +} + +} // namespace pten + +using float16 = paddle::platform::float16; +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +PT_REGISTER_CTX_KERNEL( + mean, GPU, ALL_LAYOUT, pten::Mean, float, double, bool, float16) {} +PT_REGISTER_CTX_KERNEL(add, + GPU, + ALL_LAYOUT, + pten::AddKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(subtract, + GPU, + ALL_LAYOUT, + pten::SubtractKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(divide, + GPU, + ALL_LAYOUT, + pten::DivideKernel, + float, + double, + int, + int64_t, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(multiply, + GPU, + ALL_LAYOUT, + pten::MultiplyKernel, + float, + double, + int, + int64_t, + bool, + float16, + complex64, + complex128) {} +PT_REGISTER_CTX_KERNEL(sum, + GPU, + ALL_LAYOUT, + pten::Sum, + bool, + float, + double, + float16, + int, + int64_t, + complex64, + complex128) { + kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED); +} diff --git a/paddle/pten/kernels/math_kernel.h 
b/paddle/pten/kernels/math_kernel.h new file mode 100644 index 0000000000000..2968aa3524a9f --- /dev/null +++ b/paddle/pten/kernels/math_kernel.h @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/api/lib/utils/storage.h" +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/include/infermeta.h" + +namespace pten { + +template +void Mean(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DenseTensor* out); + +template +void AddKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void SubtractKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void DivideKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void MultiplyKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +template +void Sum(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& dims, + bool keep_dim, + bool reduce_all, + DataType out_dtype, + DenseTensor* out); + +template +DenseTensor Add(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + AddKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +template +DenseTensor Subtract(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + SubtractKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +template +DenseTensor Divide(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + DivideKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +template +DenseTensor Multiply(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis) { + auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis); + pten::DenseTensor dense_out( + pten::make_intrusive( + dev_ctx.GetPlace()), + std::move(out_meta)); + MultiplyKernel(dev_ctx, x, y, axis, &dense_out); + return dense_out; +} + +} // namespace pten diff --git a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc index f12a2d48e6b2b..bd09ecb770a5d 100644 --- a/paddle/pten/tests/kernels/test_elementwise_dev_api.cc +++ 
b/paddle/pten/tests/kernels/test_elementwise_dev_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/api/lib/utils/allocator.h" #include "paddle/pten/core/dense_tensor.h" From 986723227d3e55bc00e88d6f5f1442c2a477669f Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 30 Dec 2021 05:16:52 +0000 Subject: [PATCH 2/5] fix compile bugs --- paddle/fluid/operators/cholesky_solve_op.h | 4 ++-- paddle/fluid/operators/elementwise/elementwise_mul_op.cu | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index f3b0056165426..94b68bff8f446 100644 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/complex.h" -#include "paddle/pten/include/math.h" +#include "paddle/pten/kernels/math_kernel.h" namespace paddle { namespace operators { // namespace operators @@ -205,7 +205,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { auto pt_x = paddle::experimental::MakePtenDenseTensor(commonterm); auto pt_y = paddle::experimental::MakePtenDenseTensor(commonterm_conj); auto pt_z = paddle::experimental::MakePtenDenseTensor(commonterm); - pten::Add(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get()); + pten::AddKernel(dev_ctx, *pt_x.get(), *pt_y.get(), -1, pt_z.get()); auto mat_dim_u = math::CreateMatrixDescriptor(u_bst.dims(), 0, false); auto mat_dim_c = diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index e131bc4974661..12e0062a698be 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -57,7 +57,8 @@ class ElementwiseMulKernel auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); - pten::Multiply(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, pt_z.get()); + pten::MultiplyKernel(cuda_ctx, *pt_x.get(), *pt_y.get(), axis, + pt_z.get()); } else { PADDLE_THROW(platform::errors::InvalidArgument( "X's type[%s] is not supported by elementwise_op. 
X's type should be " From 1602536cac7be2581e6c76882b953bccc8123f67 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 30 Dec 2021 06:31:56 +0000 Subject: [PATCH 3/5] merge develop --- paddle/pten/kernels/CMakeLists.txt | 2 +- paddle/pten/kernels/cpu/CMakeLists.txt | 1 + paddle/pten/kernels/cpu/math.cc | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 paddle/pten/kernels/cpu/math.cc diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index 70c187dbeb241..e80e91209e890 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -29,7 +29,7 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) -if(WITH_GPU or WITH_ROCM) +if(WITH_GPU OR WITH_ROCM) set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_gpu) else() set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_cpu) diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt index e69de29bb2d1d..9bf3df598e4c0 100644 --- a/paddle/pten/kernels/cpu/CMakeLists.txt +++ b/paddle/pten/kernels/cpu/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu cast_kernel) diff --git a/paddle/pten/kernels/cpu/math.cc b/paddle/pten/kernels/cpu/math.cc new file mode 100644 index 0000000000000..b4642d475d566 --- /dev/null +++ b/paddle/pten/kernels/cpu/math.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +namespace pten {} // namespace pten From 0f5ae09b34cdfd44e349d77dec7a86ba24018579 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Thu, 30 Dec 2021 06:46:29 +0000 Subject: [PATCH 4/5] fix compile bugs --- paddle/pten/kernels/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/pten/kernels/CMakeLists.txt b/paddle/pten/kernels/CMakeLists.txt index e80e91209e890..4c705767f4c2f 100644 --- a/paddle/pten/kernels/CMakeLists.txt +++ b/paddle/pten/kernels/CMakeLists.txt @@ -28,11 +28,9 @@ set(COMMON_KERNEL_DEPS dense_tensor kernel_context kernel_factory convert_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) -set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) +set(MATH_KERNEL_DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel pten_transpose_cpu) if(WITH_GPU OR WITH_ROCM) set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_gpu) -else() - set(MATH_KERNEL_DEPS ${MATH_KERNEL_DEPS} pten_transpose_cpu) endif() # auto build kernel targets by cmake From cfd79d42fae393872dc9298254d86272e02bef3f Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 31 Dec 2021 12:01:56 +0000 Subject: [PATCH 5/5] move cpu_impl of elementwise kernel to new directory --- .../framework/data_device_transform_test.cu | 12 +- .../elementwise/elementwise_functor.h | 52 +-- .../elementwise/elementwise_op_function.h | 89 +--- paddle/pten/api/lib/kernel_declare.h | 21 - paddle/pten/infermeta/binary.cc | 16 +- paddle/pten/kernels/cpu/elementwise_impl.h | 392 ++++++++++++++++++ paddle/pten/kernels/cpu/math_kernel.cc | 61 +-- .../general => funcs}/elementwise_base.h | 5 +- .../pten/kernels/funcs/elementwise_functor.h | 83 ++++ paddle/pten/kernels/gpu/math_kernel.cu | 32 +- paddle/pten/kernels/hybird/CMakeLists.txt | 1 - .../pten/kernels/hybird/blas/CMakeLists.txt | 0 paddle/pten/kernels/hybird/blas/elementwise.h | 59 --- paddle/pten/kernels/hybird/cpu/CMakeLists.txt | 0 paddle/pten/kernels/hybird/cpu/elementwise.h | 230 ---------- .../cuda/elementwise/elementwise_common.cu.h | 2 +- .../pten/kernels/hybird/eigen/elementwise.h | 61 --- .../hybird/general/elementwise_functor.h | 223 ---------- python/paddle/utils/code_gen/api_gen.py | 1 - 19 files changed, 568 insertions(+), 772 deletions(-) delete mode 100644 paddle/pten/api/lib/kernel_declare.h create mode 100644 paddle/pten/kernels/cpu/elementwise_impl.h rename paddle/pten/kernels/{hybird/general => funcs}/elementwise_base.h (99%) create mode 100644 paddle/pten/kernels/funcs/elementwise_functor.h delete mode 100644 paddle/pten/kernels/hybird/blas/CMakeLists.txt delete mode 100644 paddle/pten/kernels/hybird/blas/elementwise.h delete mode 100644 paddle/pten/kernels/hybird/cpu/CMakeLists.txt delete mode 100644 paddle/pten/kernels/hybird/cpu/elementwise.h delete mode 100644 paddle/pten/kernels/hybird/eigen/elementwise.h delete mode 100644 paddle/pten/kernels/hybird/general/elementwise_functor.h diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index 4e5be2e53503f..a81e4abd45e56 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -23,6 +23,9 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/pten/include/core.h" + namespace paddle { namespace framework { @@ -73,9 +76,12 @@ class TestKernel : public OpKernel { output->Resize(input->dims()); output->mutable_data(ctx.GetPlace()); - operators::TransformFunctor, T, DeviceContext> functor( - input, input, output, ctx.template device_context(), - AddFunctor()); + auto pt_input = paddle::experimental::MakePtenDenseTensor(*input); + auto pt_out = paddle::experimental::MakePtenDenseTensor(*output); + + pten::funcs::TransformFunctor, T, DeviceContext> functor( + *pt_input, *pt_input, pt_out.get(), + ctx.template device_context(), AddFunctor()); functor.Run(); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h index 6e53af41b657c..7ff8e6a1543af 100644 --- a/paddle/fluid/operators/elementwise/elementwise_functor.h +++ b/paddle/fluid/operators/elementwise/elementwise_functor.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/hostdevice.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" namespace paddle { namespace operators { @@ -25,58 +26,31 @@ namespace operators { // Add template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; +using AddFunctor = pten::funcs::AddFunctor; + template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } -}; +using InverseAddFunctor = pten::funcs::InverseAddFunctor; // Subtract template -struct SubFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } -}; +using SubFunctor = pten::funcs::SubtractFunctor; + template -struct InverseSubFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } -}; +using InverseSubFunctor = pten::funcs::InverseSubtractFunctor; // Multiply template -struct MulFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } -}; +using MulFunctor = pten::funcs::MultiplyFunctor; + template -struct InverseMulFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } -}; +using InverseMulFunctor = pten::funcs::InverseMultiplyFunctor; // Divide -#define DIV_ERROR_INFO \ - "InvalidArgumentError: Integer division by zero encountered in " \ - "(floor) divide. Please check the input value." - -template -struct DivFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } -}; - template -struct DivFunctor::value>::type> { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { - // For int32/int64, need to check whether the divison is zero. 
- PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); - return a / b; - } -}; +using DivFunctor = pten::funcs::DivideFunctor; -template -struct InverseDivFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } -}; +template +using InverseDivFunctor = pten::funcs::InverseDivideFunctor; // Floor Divide template diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 9700ca3584de8..6f3e17ea4d434 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -31,8 +31,7 @@ limitations under the License. */ // only can include the headers in paddle/pten/include dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/cpu/elementwise_impl.h" #if defined(__NVCC__) || defined(__HIPCC__) #ifdef __NVCC__ @@ -151,9 +150,9 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, int *x_dims_array, int *y_dims_array, int *out_dims_array, const int max_dim, const int axis) { - pten::general::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, - y_dims_array, out_dims_array, max_dim, - axis); + pten::funcs::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array, + y_dims_array, out_dims_array, max_dim, + axis); } template @@ -1073,71 +1072,9 @@ void CommonGradBroadcastCUDA( inline framework::DDim trim_trailing_singular_dims( const framework::DDim &dims) { - return pten::general::trim_trailing_singular_dims(dims); + return pten::funcs::trim_trailing_singular_dims(dims); } -template -class TransformFunctor { - public: - TransformFunctor(const framework::Tensor *x, const framework::Tensor *y, - framework::Tensor *z, const DeviceContext &ctx, Functor func, - const bool is_xsize_larger = true) - : x_(x->data()), - y_(y->data()), - z_(z->mutable_data(ctx.GetPlace())), - nx_(x->numel()), - ctx_(ctx), - func_(func), - is_xsize_larger_(is_xsize_larger) { - if (is_xsize_larger_ == false) { - nx_ = y->numel(); - } - } - - inline void Run() const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, y_, z_, func_); - } - - inline void RunRowWise(int n, int pre) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::RowwiseTransformIterator(y_, n), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::RowwiseTransformIterator(x_, n), - z_, func_); - } - } - - inline void RunMidWise(int n, int pre, int post) const { - platform::Transform trans; - if (is_xsize_larger_) { - trans(ctx_, x_, x_ + nx_, - pten::general::MidWiseTransformIterator(y_, n, - post), - z_, func_); - } else { - trans(ctx_, y_, y_ + nx_, - pten::general::MidWiseTransformIterator(x_, n, - post), - z_, func_); - } - } - - private: - const T *x_; - const T *y_; - OutType *z_; - int64_t nx_; - const DeviceContext &ctx_; - Functor func_; - bool is_xsize_larger_; -}; - template struct ElemwiseGradNoBroadcast { const T *x_; @@ -1457,13 +1394,13 @@ void ElemwiseGradComputeWithBroadcast( if (is_xsize_larger) { auto y_dims_trimed = trim_trailing_singular_dims(y_dims); axis_trim = (y_dims_trimed.size() == 0) ? 
x_dims.size() : axis; - pten::general::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dims, y_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } else { auto x_dims_trimed = trim_trailing_singular_dims(x_dims); axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - pten::general::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, - &post, &is_run_common_broadcast); + pten::funcs::get_mid_dims(y_dims, x_dims_trimed, axis_trim, &pre, &n, &post, + &is_run_common_broadcast); } // special case for common backward implementation. if (is_run_common_broadcast) { @@ -1861,8 +1798,8 @@ void FusedElemwiseAndActComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); if (post == 1) { int h = pre; int w = n; @@ -2409,8 +2346,8 @@ void FusedElemwiseAndActGradComputeWithBroadcast( axis = (y_dim.size() == 0) ? x_dim.size() : axis; int pre, n, post, is_run_common_broadcast; - pten::general::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, - &is_run_common_broadcast); + pten::funcs::get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post, + &is_run_common_broadcast); const T *x_data = nullptr; const T *y_data = nullptr; if (x->IsInitialized()) x_data = x->data(); diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h deleted file mode 100644 index 4d3143ef09ccc..0000000000000 --- a/paddle/pten/api/lib/kernel_declare.h +++ /dev/null @@ -1,21 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/kernel_registry.h" - -// TODO(chenweihang) After the kernel is split into a single file, -// the kernel declare statement is automatically generated according to the -// file name of the kernel, and this header file will be removed diff --git a/paddle/pten/infermeta/binary.cc b/paddle/pten/infermeta/binary.cc index 5d3844a1dec3d..944c64ecd75e2 100644 --- a/paddle/pten/infermeta/binary.cc +++ b/paddle/pten/infermeta/binary.cc @@ -14,7 +14,7 @@ limitations under the License. */ // See Note [ Why still include the fluid headers? 
] #include "paddle/pten/infermeta/binary.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace pten { @@ -162,13 +162,13 @@ DenseTensorMeta ElementwiseInferMeta(const DenseTensorMeta& x_meta, std::vector x_dims_array(max_dim); std::vector y_dims_array(max_dim); std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); return_meta.dims = paddle::framework::make_ddim(out_dims_array); } return_meta.lod = x_meta.lod; diff --git a/paddle/pten/kernels/cpu/elementwise_impl.h b/paddle/pten/kernels/cpu/elementwise_impl.h new file mode 100644 index 0000000000000..d3687b22fb392 --- /dev/null +++ b/paddle/pten/kernels/cpu/elementwise_impl.h @@ -0,0 +1,392 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" + +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +namespace pten { + +// Add +template +struct SameDimsAddFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VADD(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsAddFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + z->mutable_data(); + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x + eigen_y; + } +}; + +// Subtract +template +struct SameDimsSubtractFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VSUB(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsSubtractFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto eigen_x = 
pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x - eigen_y; + } +}; + +// Divide +template +struct SameDimsDivideFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + paddle::platform::errors::InvalidArgument( + "If use SameDimsDivideFunctor, template args(T) must be floating " + "point. "); + } +}; + +template +struct SameDimsDivideFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VDIV(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +// Multiply +template +struct SameDimsMultiplyFunctor { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z); +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto blas = paddle::operators::math::GetBlas(dev_ctx); + blas.VMUL(x.numel(), x.data(), y.data(), z->mutable_data()); + } +}; + +template +struct SameDimsMultiplyFunctor< + DevCtx, + T, + typename std::enable_if::value>::type> { + void operator()(const DevCtx& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + auto eigen_z = pten::EigenVector::Flatten(*z); + auto& place = *dev_ctx.eigen_device(); + eigen_z.device(place) = eigen_x * eigen_y; + } +}; + +inline void UpdateElementwiseIndexArray(const int* out_dims_array, + const int max_dim, + int* index_array) { + for (int i = max_dim - 1; i >= 0; --i) { + ++index_array[i]; + if (index_array[i] >= out_dims_array[i]) { + index_array[i] -= out_dims_array[i]; + } else { + break; + } + } +} + +inline int GetElementwiseIndex(const int* x_dims_array, + const int max_dim, + const int* index_array) { + int index_ = 0; + for (int i = 0; i < max_dim; i++) { + if (x_dims_array[i] > 1) { + index_ = index_ * x_dims_array[i] + index_array[i]; + } + } + return index_; +} + +template +void CommonForwardBroadcastCPU(const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + int* x_dims_array, + int* y_dims_array, + int* out_dims_array, + int max_dim, + const paddle::platform::CPUDeviceContext& ctx, + Functor func, + const bool is_xsize_larger = true) { + std::vector index_array(max_dim, 0); + const T* x_data = x.data(); + const T* y_data = y.data(); + PADDLE_ENFORCE_NOT_NULL(x_data, + paddle::platform::errors::InvalidArgument( + "The input X should not be empty.")); + PADDLE_ENFORCE_NOT_NULL(y_data, + paddle::platform::errors::InvalidArgument( + "The input Y should not be empty.")); + OutType* out_data = z->mutable_data(); + + const int out_size = std::accumulate( + out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); + int x_index, y_index; + for (int out_index = 0; out_index < out_size; ++out_index) { + x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); + y_index = 
GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); + if (is_xsize_larger) { + out_data[out_index] = func(x_data[x_index], y_data[y_index]); + } else { + out_data[out_index] = func(y_data[y_index], x_data[x_index]); + } + + UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); + } +} + +template +void CommonElementwiseBroadcastForward( + const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z, + const DDim& x_dims, + const DDim& y_dims, + Functor func, + int axis, + const bool is_xsize_larger = true) { + int max_dim = (std::max)(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + + CommonForwardBroadcastCPU(x, + y, + z, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + dev_ctx, + func, + is_xsize_larger); +} + +// It is a common CPU implementation to compute binary calculation with the +// support of broadcast. Note: +// 1. CPU implementation cannot support the case when x needs broadcast, thus +// this function need to be called with XxxFunctor and XxxInverseFunctor, +// like AddFunctor and InverseAddFunctor. +// 2. The corresponding GPU implementation supports all the broadcast cases, +// thus there is no need to define and call with XxxInverseFunctor. +// TODO(liuyiqun): optimize the CPU implementation to support all broadcast +// cases and avoid the need of XxxInverseFunctor. +template +void ElementwiseCompute(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + Functor func, + DenseTensor* z) { + z->mutable_data(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + } + funcs:: + TransformFunctor + functor(x, y, z, dev_ctx, func, is_xsize_larger); + if (x_dims == y_dims) { + functor.Run(); + return; + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + PADDLE_ENFORCE_GE( + axis, + 0, + paddle::platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, + max_dim, + paddle::platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, + axis)); + + int pre, n, post, is_run_common_broadcast, axis_trim = 0; + if (is_xsize_larger) { + auto y_dims_trimed = funcs::trim_trailing_singular_dims(y_dims); + axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; + funcs::get_mid_dims(x_dims, + y_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } else { + auto x_dims_trimed = funcs::trim_trailing_singular_dims(x_dims); + axis_trim = (x_dims_trimed.size() == 0) ? 
y_dims.size() : axis; + funcs::get_mid_dims(y_dims, + x_dims_trimed, + axis_trim, + &pre, + &n, + &post, + &is_run_common_broadcast); + } + // special case for common implementation. + // case 1: x=[2,3,1,5], y=[2,1,4,1] + // case 2: x=[2,3,4], y=[1,1,4] + if (is_run_common_broadcast == 1) { + CommonElementwiseBroadcastForward( + dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); + return; + } + + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } +} + +template +struct SameDimsElementwiseCompute { + void operator()(const paddle::platform::CPUDeviceContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* z) { + Functor()(dev_ctx, x, y, z); + } +}; + +} // namespace pten diff --git a/paddle/pten/kernels/cpu/math_kernel.cc b/paddle/pten/kernels/cpu/math_kernel.cc index 152d945144f6c..c022dd08bbe40 100644 --- a/paddle/pten/kernels/cpu/math_kernel.cc +++ b/paddle/pten/kernels/cpu/math_kernel.cc @@ -18,9 +18,11 @@ #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/common/scalar.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/cpu/elementwise.h" + +#include "paddle/pten/kernels/cpu/elementwise_impl.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" + #include "paddle/pten/kernels/hybird/eigen/reduce.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" #include "paddle/pten/kernels/hybird/general/reduce_impl.h" // See Note [ Why still include the fluid headers? ] @@ -30,29 +32,28 @@ namespace pten { -#define DEFINE_CPU_ELEMENTWISE_OP(name) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - out->mutable_data(); \ - if (x.dims() == y.dims()) { \ - SameDimsElementwiseCompute< \ - general::SameDims##name##Functor>()( \ - dev_ctx, x, y, out); \ - } else { \ - auto x_dims = x.dims(); \ - auto y_dims = y.dims(); \ - if (x_dims.size() >= y_dims.size()) { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::name##Functor(), out); \ - } else { \ - ElementwiseCompute, T>( \ - dev_ctx, x, y, axis, general::Inverse##name##Functor(), out); \ - } \ - } \ +#define DEFINE_CPU_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + out->mutable_data(); \ + if (x.dims() == y.dims()) { \ + SameDimsElementwiseCompute>()( \ + dev_ctx, x, y, out); \ + } else { \ + auto x_dims = x.dims(); \ + auto y_dims = y.dims(); \ + if (x_dims.size() >= y_dims.size()) { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::name##Functor(), out); \ + } else { \ + ElementwiseCompute, T>( \ + dev_ctx, x, y, axis, funcs::Inverse##name##Functor(), out); \ + } \ + } \ } template @@ -76,17 +77,17 @@ void DivideKernel(const Context& dev_ctx, // allocate memory for out out->mutable_data(); if (x.dims() == y.dims() && std::is_floating_point::value) { - SameDimsElementwiseCompute>()( + SameDimsElementwiseCompute>()( dev_ctx, x, y, out); } else { auto x_dims = x.dims(); auto y_dims = y.dims(); if (x_dims.size() >= y_dims.size()) { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::DivideFunctor(), out); + ElementwiseCompute, T>( + dev_ctx, x, y, axis, funcs::DivideFunctor(), out); } else { - ElementwiseCompute, T>( - dev_ctx, x, y, axis, general::InverseDivideFunctor(), out); + ElementwiseCompute, T>( + dev_ctx, x, 
y, axis, funcs::InverseDivideFunctor(), out); } } } diff --git a/paddle/pten/kernels/hybird/general/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h similarity index 99% rename from paddle/pten/kernels/hybird/general/elementwise_base.h rename to paddle/pten/kernels/funcs/elementwise_base.h index 20154a8744f3d..a0c6d5ba57011 100644 --- a/paddle/pten/kernels/hybird/general/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/pten/core/dense_tensor.h" namespace pten { -namespace general { +namespace funcs { using DDim = paddle::framework::DDim; @@ -378,6 +378,5 @@ inline void GetBroadcastDimsArrays(const DDim &x_dims, } } } - -} // namespace general +} // namespace funcs } // namespace pten diff --git a/paddle/pten/kernels/funcs/elementwise_functor.h b/paddle/pten/kernels/funcs/elementwise_functor.h new file mode 100644 index 0000000000000..9b2519b0fd6b1 --- /dev/null +++ b/paddle/pten/kernels/funcs/elementwise_functor.h @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace pten { +namespace funcs { + +// Define the binary functors used in elementwise ops. + +// Add +template +struct AddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } +}; +template +struct InverseAddFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } +}; + +// Subtract +template +struct SubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; +template +struct InverseSubtractFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } +}; + +// Multiply +template +struct MultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } +}; +template +struct InverseMultiplyFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } +}; + +// Divide +#define DIV_ERROR_INFO \ + "InvalidArgumentError: Integer division by zero encountered in " \ + "(floor) divide. Please check the input value." + +template +struct DivideFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivideFunctor< + T, + typename std::enable_if::value>::type> { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. 
+ PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); + return a / b; + } +}; + +template +struct InverseDivideFunctor { + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } +}; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/gpu/math_kernel.cu b/paddle/pten/kernels/gpu/math_kernel.cu index 636d0f16b0d71..760bebe687841 100644 --- a/paddle/pten/kernels/gpu/math_kernel.cu +++ b/paddle/pten/kernels/gpu/math_kernel.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/pten/kernels/math_kernel.h" #include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/funcs/elementwise_functor.h" #include "paddle/pten/kernels/hybird/cuda/elementwise/elementwise.h" #include "paddle/pten/kernels/hybird/cuda/reduce/reduce.h" -#include "paddle/pten/kernels/hybird/general/elementwise_functor.h" #include "paddle/pten/kernels/hybird/general/reduce_impl.h" #ifdef __NVCC__ @@ -39,21 +39,21 @@ namespace kps = paddle::operators::kernel_primitives; namespace pten { -#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - std::vector inputs; \ - std::vector outputs; \ - inputs.emplace_back(&x); \ - inputs.emplace_back(&y); \ - outputs.emplace_back(out); \ - out->mutable_data(); \ - LaunchElementwiseCudaKernel( \ - dev_ctx, inputs, &outputs, axis, general::name##Functor()); \ +#define DEFINE_CUDA_ELEMENTWISE_OP(name) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + std::vector inputs; \ + std::vector outputs; \ + inputs.emplace_back(&x); \ + inputs.emplace_back(&y); \ + outputs.emplace_back(out); \ + out->mutable_data(); \ + LaunchElementwiseCudaKernel( \ + dev_ctx, inputs, &outputs, axis, funcs::name##Functor()); \ } /** diff --git a/paddle/pten/kernels/hybird/CMakeLists.txt b/paddle/pten/kernels/hybird/CMakeLists.txt index 1304aa1798c0c..5d04bae2eae82 100644 --- a/paddle/pten/kernels/hybird/CMakeLists.txt +++ b/paddle/pten/kernels/hybird/CMakeLists.txt @@ -1,5 +1,4 @@ add_subdirectory(eigen) -add_subdirectory(blas) add_subdirectory(general) cc_library(pten_transpose_cpu SRCS transpose.cc DEPS dense_tensor pten_context) diff --git a/paddle/pten/kernels/hybird/blas/CMakeLists.txt b/paddle/pten/kernels/hybird/blas/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/paddle/pten/kernels/hybird/blas/elementwise.h b/paddle/pten/kernels/hybird/blas/elementwise.h deleted file mode 100644 index 1a530c9f8e940..0000000000000 --- a/paddle/pten/kernels/hybird/blas/elementwise.h +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace pten { -namespace blas { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VADD(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VSUB(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseDiv(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VDIV(x.numel(), x.data(), y.data(), out->mutable_data()); -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto blas = paddle::operators::math::GetBlas(dev_ctx); - blas.VMUL(x.numel(), x.data(), y.data(), out->mutable_data()); -} -} // namespace blas -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cpu/CMakeLists.txt b/paddle/pten/kernels/hybird/cpu/CMakeLists.txt deleted file mode 100644 index e69de29bb2d1d..0000000000000 diff --git a/paddle/pten/kernels/hybird/cpu/elementwise.h b/paddle/pten/kernels/hybird/cpu/elementwise.h deleted file mode 100644 index d503957a76262..0000000000000 --- a/paddle/pten/kernels/hybird/cpu/elementwise.h +++ /dev/null @@ -1,230 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" - -namespace pten { - -inline void UpdateElementwiseIndexArray(const int *out_dims_array, - const int max_dim, - int *index_array) { - for (int i = max_dim - 1; i >= 0; --i) { - ++index_array[i]; - if (index_array[i] >= out_dims_array[i]) { - index_array[i] -= out_dims_array[i]; - } else { - break; - } - } -} - -inline int GetElementwiseIndex(const int *x_dims_array, - const int max_dim, - const int *index_array) { - int index_ = 0; - for (int i = 0; i < max_dim; i++) { - if (x_dims_array[i] > 1) { - index_ = index_ * x_dims_array[i] + index_array[i]; - } - } - return index_; -} - -template -void CommonForwardBroadcastCPU(const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - int *x_dims_array, - int *y_dims_array, - int *out_dims_array, - int max_dim, - const paddle::platform::CPUDeviceContext &ctx, - Functor func, - const bool is_xsize_larger = true) { - std::vector index_array(max_dim, 0); - const T *x_data = x.data(); - const T *y_data = y.data(); - PADDLE_ENFORCE_NOT_NULL(x_data, - paddle::platform::errors::InvalidArgument( - "The input X should not be empty.")); - PADDLE_ENFORCE_NOT_NULL(y_data, - paddle::platform::errors::InvalidArgument( - "The input Y should not be empty.")); - OutType *out_data = z->mutable_data(); - - const int out_size = std::accumulate( - out_dims_array, out_dims_array + max_dim, 1, std::multiplies()); - int x_index, y_index; - for (int out_index = 0; out_index < out_size; ++out_index) { - x_index = GetElementwiseIndex(x_dims_array, max_dim, index_array.data()); - y_index = GetElementwiseIndex(y_dims_array, max_dim, index_array.data()); - if (is_xsize_larger) { - out_data[out_index] = func(x_data[x_index], y_data[y_index]); - } else { - out_data[out_index] = func(y_data[y_index], x_data[x_index]); - } - - UpdateElementwiseIndexArray(out_dims_array, max_dim, index_array.data()); - } -} - -template -void CommonElementwiseBroadcastForward( - const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z, - const DDim &x_dims, - const DDim &y_dims, - Functor func, - int axis, - const bool is_xsize_larger = true) { - int max_dim = (std::max)(x_dims.size(), y_dims.size()); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - general::GetBroadcastDimsArrays(x_dims, - y_dims, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - axis); - - CommonForwardBroadcastCPU(x, - y, - z, - x_dims_array.data(), - y_dims_array.data(), - out_dims_array.data(), - max_dim, - dev_ctx, - func, - is_xsize_larger); -} - -// It is a common CPU implementation to compute binary calculation with the -// support of broadcast. Note: -// 1. CPU implementation cannot support the case when x needs broadcast, thus -// this function need to be called with XxxFunctor and XxxInverseFunctor, -// like AddFunctor and InverseAddFunctor. -// 2. 
The corresponding GPU implementation supports all the broadcast cases, -// thus there is no need to define and call with XxxInverseFunctor. -// TODO(liuyiqun): optimize the CPU implementation to support all broadcast -// cases and avoid the need of XxxInverseFunctor. -template -void ElementwiseCompute(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - int axis, - Functor func, - DenseTensor *z) { - z->mutable_data(); - auto x_dims = x.dims(); - auto y_dims = y.dims(); - bool is_xsize_larger = true; - int max_dim = x_dims.size(); - if (x_dims.size() < y_dims.size()) { - is_xsize_larger = false; - max_dim = y_dims.size(); - } - general:: - TransformFunctor - functor(x, y, z, dev_ctx, func, is_xsize_larger); - if (x_dims == y_dims) { - functor.Run(); - return; - } - - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - PADDLE_ENFORCE_GE( - axis, - 0, - paddle::platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT(axis, - max_dim, - paddle::platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", - max_dim, - axis)); - - int pre, n, post, is_run_common_broadcast, axis_trim = 0; - if (is_xsize_larger) { - auto y_dims_trimed = general::trim_trailing_singular_dims(y_dims); - axis_trim = (y_dims_trimed.size() == 0) ? x_dims.size() : axis; - general::get_mid_dims(x_dims, - y_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } else { - auto x_dims_trimed = general::trim_trailing_singular_dims(x_dims); - axis_trim = (x_dims_trimed.size() == 0) ? y_dims.size() : axis; - general::get_mid_dims(y_dims, - x_dims_trimed, - axis_trim, - &pre, - &n, - &post, - &is_run_common_broadcast); - } - // special case for common implementation. - // case 1: x=[2,3,1,5], y=[2,1,4,1] - // case 2: x=[2,3,4], y=[1,1,4] - if (is_run_common_broadcast == 1) { - CommonElementwiseBroadcastForward( - dev_ctx, x, y, z, x_dims, y_dims, func, axis, is_xsize_larger); - return; - } - - if (post == 1) { - functor.RunRowWise(n, pre); - return; - } else { - functor.RunMidWise(n, pre, post); - return; - } -} - -template -struct SameDimsElementwiseCompute { - void operator()(const paddle::platform::CPUDeviceContext &dev_ctx, - const DenseTensor &x, - const DenseTensor &y, - DenseTensor *z) { - Functor()(dev_ctx, x, y, z); - } -}; - -} // namespace pten diff --git a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h index 7c5f3a9778404..ae384693249a4 100644 --- a/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h +++ b/paddle/pten/kernels/hybird/cuda/elementwise/elementwise_common.cu.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/function_traits.h" #include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/general/elementwise_base.h" +#include "paddle/pten/kernels/funcs/elementwise_base.h" namespace pten { namespace kps = paddle::operators::kernel_primitives; diff --git a/paddle/pten/kernels/hybird/eigen/elementwise.h b/paddle/pten/kernels/hybird/eigen/elementwise.h deleted file mode 100644 index e67cce63d461f..0000000000000 --- a/paddle/pten/kernels/hybird/eigen/elementwise.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" - -namespace pten { -namespace eigen { - -template -void ElementwiseAdd(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - out->mutable_data(); - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x + eigen_y; -} - -template -void ElementwiseSub(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x - eigen_y; -} - -template -void ElementwiseMul(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - auto eigen_z = pten::EigenVector::Flatten(*out); - auto& place = *dev_ctx.eigen_device(); - eigen_z.device(place) = eigen_x * eigen_y; -} - -} // namespace eigen -} // namespace pten diff --git a/paddle/pten/kernels/hybird/general/elementwise_functor.h b/paddle/pten/kernels/hybird/general/elementwise_functor.h deleted file mode 100644 index 62b422f4ae414..0000000000000 --- a/paddle/pten/kernels/hybird/general/elementwise_functor.h +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/fluid/platform/hostdevice.h" -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/blas/elementwise.h" -#include "paddle/pten/kernels/hybird/eigen/elementwise.h" - -namespace pten { -namespace general { - -// Define the binary functors used in elementwise ops. 
- -// Add -template -struct SameDimsAddFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsAddFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseAdd(dev_ctx, x, y, z); - } -}; - -template -struct AddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a + b; } -}; -template -struct InverseAddFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b + a; } -}; - -// Subtract -template -struct SameDimsSubtractFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsSubtractFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseSub(dev_ctx, x, y, z); - } -}; - -template -struct SubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } -}; -template -struct InverseSubtractFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b - a; } -}; - -// Divide -template -struct SameDimsDivideFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - paddle::platform::errors::InvalidArgument( - "If use SameDimsDivideFunctor, template args(T) must be floating " - "point. "); - } -}; - -template -struct SameDimsDivideFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseDiv(dev_ctx, x, y, z); - } -}; - -#define DIV_ERROR_INFO \ - "InvalidArgumentError: Integer division by zero encountered in " \ - "(floor) divide. Please check the input value." - -template -struct DivideFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } -}; - -template -struct DivideFunctor< - T, - typename std::enable_if::value>::type> { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { - // For int32/int64, need to check whether the divison is zero. 
- PADDLE_ENFORCE(b != 0, DIV_ERROR_INFO); - return a / b; - } -}; - -template -struct InverseDivideFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b / a; } -}; - -// Multiply -template -struct SameDimsMultiplyFunctor { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z); -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - blas::ElementwiseMul(dev_ctx, x, y, z); - } -}; - -template -struct SameDimsMultiplyFunctor< - DevCtx, - T, - typename std::enable_if::value>::type> { - void operator()(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* z) { - eigen::ElementwiseMul(dev_ctx, x, y, z); - } -}; -template -struct MultiplyFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a * b; } -}; -template -struct InverseMultiplyFunctor { - inline HOSTDEVICE T operator()(const T& a, const T& b) const { return b * a; } -}; - -} // namespace general -} // namespace pten diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 72bf26c57dd5a..35720ae32fe38 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -342,7 +342,6 @@ def source_include(header_file_path): #include "paddle/pten/api/include/kernel_signature.h" #include "paddle/pten/api/lib/api_registry.h" -#include "paddle/pten/api/lib/kernel_declare.h" #include "paddle/pten/api/lib/kernel_dispatch.h" #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/core/kernel_registry.h"
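
Note for reviewers: the broadcast walk that CommonForwardBroadcastCPU performs (moved into paddle/pten/kernels/cpu/elementwise_impl.h above) boils down to two helpers, GetElementwiseIndex and UpdateElementwiseIndexArray. The following standalone C++ sketch is not part of the patch; it reproduces only the index arithmetic, with toy shapes, a plain main() driver, and no Paddle headers, all of which are illustrative assumptions.

// Sketch of the index arithmetic behind CommonForwardBroadcastCPU:
// walk the output element by element and map each output coordinate
// back to an input offset, skipping size-1 (broadcast) dimensions.
#include <cstdio>
#include <functional>
#include <numeric>
#include <vector>

// Same logic as pten::GetElementwiseIndex: dims of size 1 contribute
// nothing, so the same input element is reused along a broadcast dim.
int GetElementwiseIndex(const int* dims, int max_dim, const int* index) {
  int offset = 0;
  for (int i = 0; i < max_dim; ++i) {
    if (dims[i] > 1) {
      offset = offset * dims[i] + index[i];
    }
  }
  return offset;
}

// Same logic as pten::UpdateElementwiseIndexArray: advance the output
// coordinate by one, carrying from the innermost dimension outward.
void UpdateElementwiseIndexArray(const int* out_dims, int max_dim,
                                 int* index) {
  for (int i = max_dim - 1; i >= 0; --i) {
    ++index[i];
    if (index[i] >= out_dims[i]) {
      index[i] -= out_dims[i];
    } else {
      break;
    }
  }
}

int main() {
  // x: [2, 3, 1], y: [1, 3, 4], broadcast output: [2, 3, 4].
  std::vector<int> x_dims = {2, 3, 1}, y_dims = {1, 3, 4},
                   out_dims = {2, 3, 4};
  const int max_dim = 3;
  std::vector<float> x(2 * 3, 1.0f), y(3 * 4, 2.0f);
  const int out_size = std::accumulate(out_dims.begin(), out_dims.end(), 1,
                                       std::multiplies<int>());
  std::vector<float> out(out_size);
  std::vector<int> index(max_dim, 0);
  for (int i = 0; i < out_size; ++i) {
    const int xi = GetElementwiseIndex(x_dims.data(), max_dim, index.data());
    const int yi = GetElementwiseIndex(y_dims.data(), max_dim, index.data());
    out[i] = x[xi] + y[yi];  // AddFunctor would be applied here.
    UpdateElementwiseIndexArray(out_dims.data(), max_dim, index.data());
  }
  std::printf("out[0] = %.1f, out[%d] = %.1f\n", out[0], out_size - 1,
              out.back());  // expect 3.0 everywhere
  return 0;
}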
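
A second sketch, also outside the patch, shows the functor pattern that the new paddle/pten/kernels/funcs/elementwise_functor.h uses: a generic DivideFunctor plus an SFINAE specialization for integral types that rejects division by zero, and an InverseDivideFunctor that swaps operands. The CPU ElementwiseCompute path picks the inverse variant when y has the higher rank, because that path can only broadcast the second operand. PADDLE_ENFORCE and HOSTDEVICE are replaced here by a plain assert and host-only code so the snippet compiles without fluid headers; those substitutions are assumptions of the sketch, not the patch's API.

#include <cassert>
#include <cstdint>
#include <type_traits>

template <typename T, typename Enable = void>
struct DivideFunctor {
  inline T operator()(const T& a, const T& b) const { return a / b; }
};

// For int32/int64 the divisor must be checked: integer division by zero
// is undefined behavior rather than inf/nan.
template <typename T>
struct DivideFunctor<
    T, typename std::enable_if<std::is_integral<T>::value>::type> {
  inline T operator()(const T& a, const T& b) const {
    assert(b != 0 && "Integer division by zero encountered in (floor) divide.");
    return a / b;
  }
};

// Swapped-operand variant, selected when the second operand is the one
// being broadcast (x has lower rank than y).
template <typename T>
struct InverseDivideFunctor {
  inline T operator()(const T& a, const T& b) const { return b / a; }
};

int main() {
  assert(DivideFunctor<float>()(1.0f, 4.0f) == 0.25f);
  assert(DivideFunctor<int64_t>()(7, 2) == 3);
  assert(InverseDivideFunctor<int>()(2, 6) == 3);
  return 0;
}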