From b9675acc9d4326b73f5b3167265a1d3f6e98dac9 Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Wed, 23 Feb 2022 16:55:49 +0800 Subject: [PATCH 01/85] change CUDA implementaion of bernoulli OP (#39732) * change CUDA implementaion of bernoulli OP * fix CI --- paddle/fluid/operators/distribution_helper.h | 9 +- paddle/phi/backends/gpu/gpu_launch_config.h | 1 + paddle/phi/kernels/gpu/bernoulli_kernel.cu | 82 +++++++++++++++---- .../tests/unittests/test_bernoulli_op.py | 39 +++++++++ .../tests/unittests/test_exponential_op.py | 11 +-- .../unittests/test_gaussian_random_op.py | 9 +- .../fluid/tests/unittests/test_poisson_op.py | 7 +- .../tests/unittests/test_uniform_random_op.py | 9 +- 8 files changed, 135 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/operators/distribution_helper.h b/paddle/fluid/operators/distribution_helper.h index ca6bcb1147a2f..c13bf687af234 100644 --- a/paddle/fluid/operators/distribution_helper.h +++ b/paddle/fluid/operators/distribution_helper.h @@ -180,8 +180,8 @@ struct normal_distribution { /******** Launch GPU function of distribution and transformation *********/ template __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, - DistOp dist, TransformOp trans, - T *out_data) { + DistOp dist, TransformOp trans, T *out_data, + size_t stride) { size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); static constexpr int kCount = DistOp::kReturnsCount; #if defined(__NVCC__) @@ -201,7 +201,8 @@ __global__ void DistributionKernel(size_t size, uint64_t seed, uint64_t offset, kps::ElementwiseUnary(&result[0], &args[0], trans); kps::WriteData(out_data + i, &result[0], size - i, - 1, total_thread, 1); + 1, stride, 1); + __syncthreads(); } } @@ -234,7 +235,7 @@ void distribution_and_transform(const platform::CUDADeviceContext &dev_ctx, DistributionKernel< T, DistOp, TransformOp><<>>( - size, seed, offset, dist, trans, out_data); + size, seed, offset, dist, trans, out_data, total_thread); } #endif diff --git a/paddle/phi/backends/gpu/gpu_launch_config.h b/paddle/phi/backends/gpu/gpu_launch_config.h index 5aa569e0197bd..e45b465122588 100644 --- a/paddle/phi/backends/gpu/gpu_launch_config.h +++ b/paddle/phi/backends/gpu/gpu_launch_config.h @@ -29,6 +29,7 @@ #include #include #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/enforce.h" #ifdef __HIPCC__ // HIP results in error or nan if > 256 diff --git a/paddle/phi/kernels/gpu/bernoulli_kernel.cu b/paddle/phi/kernels/gpu/bernoulli_kernel.cu index 6127bceef509c..ac69d398b8ac4 100644 --- a/paddle/phi/kernels/gpu/bernoulli_kernel.cu +++ b/paddle/phi/kernels/gpu/bernoulli_kernel.cu @@ -12,19 +12,30 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include #include #include +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif + #include #include + #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/bernoulli_kernel.h" // See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/operators/distribution_helper.h" #include "paddle/fluid/platform/transform.h" +DECLARE_bool(use_curand); + namespace phi { template @@ -49,26 +60,69 @@ struct BernoulliCudaFunctor { } }; +// 'curand_uniform4/hiprand_uniform4' generate 4 random number each time +template +__global__ void bernoulli_cuda_kernel( + size_t size, uint64_t seed, uint64_t offset, const T* x_data, T* out_data) { + size_t thread_idx = + static_cast(blockIdx.x * blockDim.x + threadIdx.x); + +#if defined(__NVCC__) + curandStatePhilox4_32_10_t state; + curand_init(seed, thread_idx, offset, &state); +#else + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed, thread_idx, offset, &state); +#endif + + size_t total_thread = gridDim.x * blockDim.x; + for (size_t i = 4 * thread_idx; i < size; i += total_thread * 4) { + paddle::distribution::uniform_distribution dist; + float4 rand = dist(&state); +#pragma unroll + for (size_t j = 0; j < 4; j++) { + size_t idx = i + j; + if (idx < size) { + out_data[idx] = static_cast((&rand.x)[j] <= x_data[idx]); + } + } + } +} + template void BernoulliKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { - auto numel = x.numel(); - auto* x_data = x.data(); + const T* x_data = x.data(); T* out_data = ctx.template Alloc(out); + auto numel = x.numel(); auto gen_cuda = ctx.GetGenerator(); - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = numel * seed_offset.second; - paddle::platform::Transform trans; - thrust::counting_iterator index_sequence_begin(0); - trans(ctx, - index_sequence_begin, - index_sequence_begin + numel, - x_data, - out_data, - BernoulliCudaFunctor(static_cast(seed_offset.first), - static_cast(gen_offset))); + + if (FLAGS_use_curand) { + auto seed_offset = gen_cuda->IncrementOffset(12); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + auto gpu_config = phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, 4); + size_t grid_size = gpu_config.GetGridSize(); + size_t block_size = gpu_config.GetBlockSize(); + + bernoulli_cuda_kernel<<>>( + numel, seed, offset, x_data, out_data); + } else { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = numel * seed_offset.second; + paddle::platform::Transform trans; + thrust::counting_iterator index_sequence_begin(0); + trans(ctx, + index_sequence_begin, + index_sequence_begin + numel, + x_data, + out_data, + BernoulliCudaFunctor(static_cast(seed_offset.first), + static_cast(gen_offset))); + } } } // namespace phi diff --git a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py index 471caeb77bf65..426d5d463f453 100644 --- a/python/paddle/fluid/tests/unittests/test_bernoulli_op.py +++ b/python/paddle/fluid/tests/unittests/test_bernoulli_op.py @@ -18,6 +18,7 @@ import paddle from op_test import OpTest import numpy as np +import os def output_hist(out): @@ -68,5 +69,43 @@ def test_static(self): hist, prob, rtol=0, atol=0.01), "hist: " + str(hist)) +class TestRandomValue(unittest.TestCase): + def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' + if not paddle.is_compiled_with_cuda(): + return + + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") + paddle.disable_static() + paddle.set_device('gpu') + paddle.seed(100) + np.random.seed(100) + + x_np = np.random.rand(32, 1024, 1024) + + x = paddle.to_tensor(x_np, 
dtype='float64') + y = paddle.bernoulli(x).numpy() + index0, index1, index2 = np.nonzero(y) + self.assertEqual(np.sum(index0), 260028995) + self.assertEqual(np.sum(index1), 8582429431) + self.assertEqual(np.sum(index2), 8581445798) + expect = [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.] + self.assertTrue(np.array_equal(y[16, 500, 500:510], expect)) + + x = paddle.to_tensor(x_np, dtype='float32') + y = paddle.bernoulli(x).numpy() + index0, index1, index2 = np.nonzero(y) + self.assertEqual(np.sum(index0), 260092343) + self.assertEqual(np.sum(index1), 8583509076) + self.assertEqual(np.sum(index2), 8582778540) + expect = [0., 0., 1., 1., 1., 1., 0., 1., 1., 1.] + self.assertTrue(np.array_equal(y[16, 500, 500:510], expect)) + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_exponential_op.py b/python/paddle/fluid/tests/unittests/test_exponential_op.py index 7d43ebadf41bb..ccbc0a1676302 100644 --- a/python/paddle/fluid/tests/unittests/test_exponential_op.py +++ b/python/paddle/fluid/tests/unittests/test_exponential_op.py @@ -16,6 +16,7 @@ import paddle import numpy as np from op_test import OpTest +import os paddle.enable_static() paddle.seed(100) @@ -90,18 +91,18 @@ def test_dygraph(self): self.assertTrue(np.min(x.numpy()) >= 0) paddle.enable_static() - # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' if not paddle.is_compiled_with_cuda(): return - # Note(zhouwei): The Number of threads is determined by - # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different - # GPU have different number of threads, which result in different - # random value. Only test on V100 GPU here. + # Different GPU generatte different random value. Only test V100 here. if not "V100" in paddle.device.cuda.get_device_name(): return + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + print("Test Fixed Random number on V100 GPU------>") paddle.disable_static() paddle.set_device('gpu') diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index 43bcc3438eef4..31caf4bd6be98 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import os import unittest import numpy as np import paddle @@ -293,13 +294,13 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - # Note(zhouwei): The Number of threads is determined by - # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different - # GPU have different number of threads, which result in different - # random value. Only test on V100 GPU here. + # Different GPU generatte different random value. Only test V100 here. 
if not "V100" in paddle.device.cuda.get_device_name(): return + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + def _check_random_value(dtype, expect, expect_mean, expect_std): x = paddle.randn([32, 3, 1024, 1024], dtype=dtype) actual = x.numpy() diff --git a/python/paddle/fluid/tests/unittests/test_poisson_op.py b/python/paddle/fluid/tests/unittests/test_poisson_op.py index dc4dc3284e923..2123d4e0e7e35 100644 --- a/python/paddle/fluid/tests/unittests/test_poisson_op.py +++ b/python/paddle/fluid/tests/unittests/test_poisson_op.py @@ -17,6 +17,7 @@ import numpy as np from op_test import OpTest import math +import os paddle.enable_static() paddle.seed(100) @@ -101,11 +102,15 @@ def test_dygraph(self): self.assertTrue(np.min(y.numpy()) >= 0) paddle.enable_static() - # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' def test_fixed_random_number(self): + # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t' if not paddle.is_compiled_with_cuda(): return + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + + print("Test Fixed Random number on GPU------>") paddle.disable_static() paddle.set_device('gpu') paddle.seed(2021) diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index a84c3b20da26c..41b6ed36d65cc 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import sys +import os import subprocess import unittest import numpy as np @@ -568,13 +569,13 @@ def test_fixed_random_number(self): if not paddle.is_compiled_with_cuda(): return - # Note(zhouwei): The Number of threads is determined by - # 'multiProcessorCount * maxThreadsPerMultiProcessor'. So, different - # GPU have different number of threads, which result in different - # random value. Only test on V100 GPU here. + # Different GPU generate different random value. Only test V100 here. 
if not "V100" in paddle.device.cuda.get_device_name(): return + if os.getenv("FLAGS_use_curand", None) in ('0', 'False', None): + return + def _check_random_value(dtype, expect, expect_mean, expect_std): x = paddle.rand([32, 3, 1024, 1024], dtype=dtype) actual = x.numpy() From 6241913b8bf3f6259a38cad29ea8ba9cd598ff4a Mon Sep 17 00:00:00 2001 From: maxhuiy <1508399706@qq.com> Date: Wed, 23 Feb 2022 17:33:17 +0800 Subject: [PATCH 02/85] [MLU] add cncl parallel context and mlu resource pool (#39803) * [MLU] add cncl parallel context and mlu resource pool * [MLU] fix the cncl_context_test --- paddle/fluid/imperative/CMakeLists.txt | 3 + paddle/fluid/imperative/cncl_context.cc | 237 ++++++++++++++++++ paddle/fluid/imperative/cncl_context.h | 75 ++++++ paddle/fluid/imperative/tests/CMakeLists.txt | 3 + .../imperative/tests/cncl_context_test.cc | 141 +++++++++++ paddle/fluid/platform/CMakeLists.txt | 4 + .../fluid/platform/device/mlu/CMakeLists.txt | 1 + .../platform/device/mlu/mlu_resource_pool.cc | 99 ++++++++ .../platform/device/mlu/mlu_resource_pool.h | 64 +++++ paddle/fluid/pybind/CMakeLists.txt | 8 + paddle/fluid/pybind/imperative.cc | 13 + 11 files changed, 648 insertions(+) create mode 100644 paddle/fluid/imperative/cncl_context.cc create mode 100644 paddle/fluid/imperative/cncl_context.h create mode 100644 paddle/fluid/imperative/tests/cncl_context_test.cc create mode 100644 paddle/fluid/platform/device/mlu/mlu_resource_pool.cc create mode 100644 paddle/fluid/platform/device/mlu/mlu_resource_pool.h diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 90cf0e76e0007..72f7e5af9a96e 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -31,6 +31,9 @@ if(NOT WIN32) cc_library(hccl_context SRCS hccl_context.cc DEPS collective_helper device_context tensor var_type_traits) cc_library(reducer SRCS reducer.cc DEPS layer) endif() + if(WITH_CNCL) + cc_library(cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits) + endif() if(WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL) cc_library(heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits) endif() diff --git a/paddle/fluid/imperative/cncl_context.cc b/paddle/fluid/imperative/cncl_context.cc new file mode 100644 index 0000000000000..779b748c2d2d4 --- /dev/null +++ b/paddle/fluid/imperative/cncl_context.cc @@ -0,0 +1,237 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/imperative/cncl_context.h" + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" + +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#include "paddle/fluid/platform/device/mlu/mlu_info.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, + const mluStream stream, const platform::CNCLComm *comm) { + const auto &place = src.place(); + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(place), true, + platform::errors::Unimplemented( + "Imperative mode does not support multi-CPU training yet.")); + + const void *src_ptr = src.data(); + dst->Resize(src.dims()); + auto *dst_ptr = dst->mutable_data(src.place(), src.dtype()); + auto cncl_dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(src.dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce(src_ptr, dst_ptr, src.numel(), + cncl_dtype, cnclSum, comm->comm(), + stream)); +} + +void CNCLParallelContext::BcastCNCLId( + std::vector &cncl_ids, // NOLINT + int root, int server_fd) { + if (strategy_.local_rank_ == root) { + std::vector other_trainers; + for (auto &ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) { + other_trainers.push_back(ep); + } + } + platform::SendBroadCastCommID(other_trainers, &cncl_ids); + } else { + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &cncl_ids); + } +} + +void CNCLParallelContext::Init() { + int server_fd = -1; + + std::vector cncl_ids; + cncl_ids.resize(strategy_.nrings_); + + if (strategy_.local_rank_ == 0) { + // generate the unique cnclid on the root worker + for (size_t i = 0; i < cncl_ids.size(); ++i) { + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCliqueId(&cncl_ids[i])); + } + } else { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastCNCLId(cncl_ids, 0, server_fd); + + int mlu_id = place_.device; + for (int ring_id = 0; ring_id < strategy_.nrings_; ++ring_id) { + VLOG(0) << "init cncl context nranks: " << strategy_.nranks_ + << " local rank: " << strategy_.local_rank_ << " mlu id: " << mlu_id + << " ring id: " << ring_id; + // it will assign cncl_comm in MLUDeviceContext within ring_id + platform::CNCLCommContext::Instance().CreateComm( + &cncl_ids[ring_id], strategy_.nranks_, strategy_.local_rank_, mlu_id, + ring_id); + + compute_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + } +} + +void CNCLParallelContext::InitWithRingID(int ring_id) { + int server_fd = -1; + std::vector cncl_ids; + cncl_ids.resize(1); + + if (strategy_.local_rank_ == 0) { + // generate the unique cnclid on the root worker + PADDLE_ENFORCE_MLU_SUCCESS(cnclGetCliqueId(&cncl_ids[0])); + } else { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastCNCLId(cncl_ids, 0, server_fd); + + int mlu_id = place_.device; + VLOG(0) << "init cncl context nranks: " << strategy_.nranks_ + << " local rank: " << 
strategy_.local_rank_ << " mlu id: " << mlu_id + << " ring id: " << ring_id; + // it will assign cncl_comm in MLUDeviceContext within ring_id + platform::CNCLCommContext::Instance().CreateComm( + &cncl_ids[0], strategy_.nranks_, strategy_.local_rank_, mlu_id, ring_id); + + compute_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); + comm_events_.emplace_back( + platform::MluEventResourcePool::Instance().New(place_.device)); +} + +void CNCLParallelContext::AllReduceByStream(const framework::Variable &src, + framework::Variable *dst, + int ring_id, bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + platform::is_mlu_place(place_), true, + platform::errors::Unimplemented( + "Dynamic graph mode does not support multi-CPU training yet.")); + auto *dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + platform::CNCLComm *comm = + platform::CNCLCommContext::Instance().Get(ring_id, place_); + mluStream stream = (use_calc_stream ? dev_ctx->stream() : comm->stream()); + + if (src.IsType()) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable(), stream, comm); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor is supported.", + platform::demangle(framework::ToTypeName(src.Type())))); + } +} + +void CNCLParallelContext::Broadcast(framework::Variable *src, int ring_id) { + VLOG(3) << "/// DEBUG /// start inter broadcast with ring_id: " << ring_id; + framework::Tensor *src_tensor = src->GetMutable(); + const auto &place = src_tensor->place(); + platform::CNCLComm *comm = + platform::CNCLCommContext::Instance().Get(ring_id, place); + mluStream stream = comm->stream(); + + void *src_ptr = src_tensor->data(); + auto cncl_dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(src_tensor->dtype())); + PADDLE_ENFORCE_MLU_SUCCESS(cnclBcast(src_ptr, src_tensor->numel(), cncl_dtype, + 0, comm->comm(), stream)); +} + +paddle::platform::DeviceContext *CNCLParallelContext::GetDeviceContext( + int ring_id) { + return static_cast( + platform::CNCLCommContext::Instance() + .Get(ring_id, place_) + ->dev_context()); +} + +void CNCLParallelContext::WaitCompute(int ring_id) { + PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( + "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_LT(ring_id, compute_events_.size(), + platform::errors::OutOfRange( + "ring id must < compute events size," + "but got ring id = %d, compute events size = %d", + ring_id, compute_events_.size())); + + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = + platform::CNCLCommContext::Instance().Get(ring_id, place_)->stream(); + auto event = compute_events_[ring_id].get(); + + // compute_stream-->event-->comm_stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event, compute_stream)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueWaitNotifier(event, comm_stream, 0)); +} + +void CNCLParallelContext::WaitComm(int ring_id) { + PADDLE_ENFORCE_GE(ring_id, 0, platform::errors::OutOfRange( + "ring id must >= 0, but got %d", ring_id)); + PADDLE_ENFORCE_LT(ring_id, comm_events_.size(), + platform::errors::OutOfRange( + "ring id must < comm events size," + "but got ring id = %d, comm events size = %d", + ring_id, comm_events_.size())); + + auto compute_stream = static_cast( + platform::DeviceContextPool::Instance().Get(place_)) + ->stream(); + auto comm_stream = + 
platform::CNCLCommContext::Instance().Get(ring_id, place_)->stream(); + auto event = comm_events_[ring_id].get(); + + // comm_stream-->event-->compute_stream + PADDLE_ENFORCE_MLU_SUCCESS(cnrtPlaceNotifier(event, comm_stream)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueWaitNotifier(event, compute_stream, 0)); +} + +void CNCLParallelContext::SynchronizeCompute() { + auto *compute_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place_)); + compute_dev_ctx->Wait(); +} + +} // namespace imperative +} // namespace paddle + +#endif diff --git a/paddle/fluid/imperative/cncl_context.h b/paddle/fluid/imperative/cncl_context.h new file mode 100644 index 0000000000000..85f53319bfcde --- /dev/null +++ b/paddle/fluid/imperative/cncl_context.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#if defined(PADDLE_WITH_CNCL) +#include + +#include +#include +#include + +#include "paddle/fluid/imperative/parallel_context.h" +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" + +namespace paddle { +namespace framework { +class Variable; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace imperative { + +class CNCLParallelContext : public ParallelContext { + public: + explicit CNCLParallelContext(const ParallelStrategy& strategy, + const platform::Place& place) + : ParallelContext(strategy, place) {} + + ~CNCLParallelContext() override = default; + + void BcastCNCLId(std::vector& cncl_ids, int root, // NOLINT + int server_fd); + + void Init() override; + + void InitWithRingID(int ring_id) override; + + void AllReduceByStream(const framework::Variable& src, + framework::Variable* dst, int ring_id, + bool use_calc_stream) override; + + void Broadcast(framework::Variable* src, int ring_id) override; + + paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override; + + void WaitCompute(int ring_id) override; + + void WaitComm(int ring_id) override; + + void SynchronizeCompute() override; + + private: + // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id] + std::vector> compute_events_; + + // used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream + std::vector> comm_events_; +}; + +} // namespace imperative +} // namespace paddle +#endif diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index 774bb9653e2cb..a9c81cb87798b 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -9,6 +9,9 @@ else() if (WITH_XPU_BKCL) cc_test(bkcl_context_test SRCS bkcl_context_test.cc DEPS bkcl_context) endif() + if (WITH_CNCL) + cc_test(cncl_context_test SRCS cncl_context_test.cc DEPS cncl_context) + endif() endif(WIN32) diff --git a/paddle/fluid/imperative/tests/cncl_context_test.cc b/paddle/fluid/imperative/tests/cncl_context_test.cc new file mode 100644 index 0000000000000..1d5ee8e7fc899 --- /dev/null +++ 
b/paddle/fluid/imperative/tests/cncl_context_test.cc @@ -0,0 +1,141 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/imperative/cncl_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" + +#include "gtest/gtest.h" + +namespace imperative = paddle::imperative; +namespace platform = paddle::platform; +namespace framework = paddle::framework; + +// Node1: FLAGS_selected_mlus=0 PADDLE_TRAINER_ID=0 ./cncl_context_test +// Node2: FLAGS_selected_mlus=1 PADDLE_TRAINER_ID=1 ./cncl_context_test + +int nrings = 1; +imperative::ParallelStrategy GetStrategy(int local_rank) { + std::vector eps = {"127.0.0.1:9866", "localhost:9867"}; + imperative::ParallelStrategy strategy; + strategy.trainer_endpoints_ = eps; + strategy.current_endpoint_ = eps[local_rank]; + strategy.nranks_ = 2; + strategy.local_rank_ = local_rank; + strategy.nrings_ = nrings; + return strategy; +} + +#if defined(PADDLE_WITH_CNCL) +void Broadcast(int local_rank, int device_id) { + int data_size = 4; + float test_data = 7; + const auto& place = platform::MLUPlace(device_id); + platform::MLUDeviceContext ctx(place); + + imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place); + + // init + cpc.Init(); + + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // fill data for rank 0 only + std::vector src_vec; + if (local_rank == 0) { + for (int i = 0; i < data_size; ++i) { + src_vec.push_back(test_data); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + } + ctx.Wait(); + + // call broadcast + cpc.Broadcast(src_dev_var, 0); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*src_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + for (int i = 0; i < data_size; ++i) { + EXPECT_EQ(dst_vec[i], test_data); + } +} + +TEST(Broadcast, Run) { + if (platform::GetMLUDeviceCount() >= 2) { + int local_rank = atoi(getenv("PADDLE_TRAINER_ID")); + int device_id = atoi(getenv("FLAGS_selected_mlus")); + Broadcast(local_rank, device_id); + } +} + +void AllReduceByStream(int local_rank, int device_id) { + int data_size = 32; + const auto& place = platform::MLUPlace(device_id); + platform::MLUDeviceContext ctx(place); + + imperative::CNCLParallelContext cpc(GetStrategy(local_rank), place); + + // init + cpc.Init(); + + // input data + framework::Variable* src_dev_var(new framework::Variable()); + auto* src_dev_tensor = src_dev_var->GetMutable(); + src_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // fill input data + std::vector src_vec; + for (int i = 0; i < data_size; ++i) { + src_vec.push_back(1.0 + local_rank); + } + framework::TensorFromVector(src_vec, ctx, src_dev_tensor); + ctx.Wait(); + + // output 
data + framework::Variable* dst_dev_var(new framework::Variable()); + auto* dst_dev_tensor = dst_dev_var->GetMutable(); + dst_dev_tensor->mutable_data(phi::make_ddim({data_size}), place); + + // call allreduce + cpc.AllReduceByStream(*src_dev_var, dst_dev_var, 0, false); + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + + // check result + std::vector dst_vec; + framework::TensorToVector(*dst_dev_tensor, ctx, &dst_vec); + ctx.Wait(); + + EXPECT_EQ(dst_vec.size(), src_vec.size()); + for (int i = 0; i < data_size; ++i) { + EXPECT_EQ(dst_vec[i], 3.0); + } +} + +TEST(AllReduceByStream, Run) { + if (platform::GetMLUDeviceCount() >= 2) { + int local_rank = atoi(getenv("PADDLE_TRAINER_ID")); + int device_id = atoi(getenv("FLAGS_selected_mlus")); + AllReduceByStream(local_rank, device_id); + } +} +#endif diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 478b71745e4ac..37709c953e13b 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -148,6 +148,10 @@ if(WITH_ASCEND_CL) target_link_libraries(device_context npu_resource_pool) endif() +if(WITH_MLU) + target_link_libraries(device_context mlu_resource_pool) +endif() + if(WITH_CUSTOM_DEVICE) target_link_libraries(device_context custom_context) endif() diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 724776bfad233..1f3a7670849c2 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -9,3 +9,4 @@ cc_library(mlu_stream SRCS mlu_stream.cc DEPS boost mlu_info stream_callback_man cc_library(mlu_device_context SRCS device_context.cc DEPS mlu_stream) cc_test(mlu_device_context_test SRCS device_context_test.cc DEPS mlu_device_context) cc_library(mlu_collective_helper SRCS mlu_collective_helper.cc DEPS mlu_stream mlu_info) +cc_library(mlu_resource_pool SRCS mlu_resource_pool.cc DEPS mlu_info) diff --git a/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc b/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc new file mode 100644 index 0000000000000..fbe3eca1c4d23 --- /dev/null +++ b/paddle/fluid/platform/device/mlu/mlu_resource_pool.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(PADDLE_WITH_MLU) +#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h" + +namespace paddle { +namespace platform { + +MluStreamResourcePool::MluStreamResourcePool() { + int dev_cnt = platform::GetMLUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetMLUDeviceId(dev_idx); + mluStream stream; + cnrtQueueCreate(&stream); + return stream; + }; + + auto deleter = [dev_idx](mluStream stream) { + platform::SetMLUDeviceId(dev_idx); + cnrtQueueDestroy(stream); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +MluStreamResourcePool& MluStreamResourcePool::Instance() { + static MluStreamResourcePool pool; + return pool; +} + +std::shared_ptr MluStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +MluEventResourcePool::MluEventResourcePool() { + int dev_cnt = platform::GetMLUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetMLUDeviceId(dev_idx); + mluEventHandle event; + cnrtNotifierCreate(&event); + return event; + }; + + auto deleter = [dev_idx](mluEventHandle event) { + platform::SetMLUDeviceId(dev_idx); + cnrtNotifierDestroy(event); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +MluEventResourcePool& MluEventResourcePool::Instance() { + static MluEventResourcePool pool; + return pool; +} + +std::shared_ptr MluEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/device/mlu/mlu_resource_pool.h b/paddle/fluid/platform/device/mlu/mlu_resource_pool.h new file mode 100644 index 0000000000000..b0e2af7f024cb --- /dev/null +++ b/paddle/fluid/platform/device/mlu/mlu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(PADDLE_WITH_MLU) +#include +#include +#include + +#include "paddle/fluid/platform/device/mlu/mlu_info.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using MluStreamObject = std::remove_pointer::type; +using MluEventObject = std::remove_pointer::type; + +class MluStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static MluStreamResourcePool &Instance(); + + private: + MluStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(MluStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class MluEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static MluEventResourcePool &Instance(); + + private: + MluEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(MluEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 26c35167f404a..01b21d02ea017 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -37,6 +37,10 @@ if (WITH_ASCEND_CL) set(PYBIND_DEPS ${PYBIND_DEPS} heter_ccl_context) endif() +if (WITH_CNCL) + set(PYBIND_DEPS ${PYBIND_DEPS} cncl_context) +endif() + if(NOT WIN32) set(PYBIND_DEPS ${PYBIND_DEPS} data_loader) set(PYBIND_DEPS ${PYBIND_DEPS} mmap_allocator) @@ -134,6 +138,10 @@ if(WITH_PYTHON) list(APPEND OP_FUNCTION_GENERETOR_DEPS hccl_context) endif(WITH_ASCEND_CL) + if(WITH_CNCL) + list(APPEND OP_FUNCTION_GENERETOR_DEPS cncl_context) + endif(WITH_CNCL) + add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) add_executable(eager_op_function_generator eager_op_function_generator.cc) diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 387addda9edd1..8c5ed2d118301 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/bkcl_context.h" +#include "paddle/fluid/imperative/cncl_context.h" #include "paddle/fluid/imperative/data_loader.h" #include "paddle/fluid/imperative/gloo_context.h" #include "paddle/fluid/imperative/hccl_context.h" @@ -2559,6 +2560,18 @@ void BindImperative(py::module *m_ptr) { py::arg("ring_id")); #endif +#if defined(PADDLE_WITH_CNCL) + py::class_>( + m, "CNCLParallelContext") + .def(py::init()) + .def("init", [](imperative::CNCLParallelContext &self) { self.Init(); }) + .def("init_with_ring_id", + &imperative::CNCLParallelContext::InitWithRingID, + py::arg("ring_id")); +#endif + #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) py::class_ Date: Wed, 23 Feb 2022 17:42:11 +0800 Subject: [PATCH 03/85] [Phi] Polish default signature attr and output select impl (#39810) * polish default sig impl * revert dispenable out --- paddle/fluid/framework/pten_utils.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index b96eb848e43a4..0ecc04dbd6b8d 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -137,7 +137,7 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { continue; } // If contains dispensable input, we should override the - // GetExpectedPtenKernelArgs method self + // OpArgumentMapping method self in phi/ops/compat dir if (in.has_dispensable() && in.dispensable()) { VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name; continue; @@ -153,7 +153,11 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { for (int i = 0; i < op_proto_->outputs_size(); ++i) { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); - // TODO(chenweihang): outputs also need skip some cases + if ((out.has_extra() && out.extra()) || (out.has_quant() && out.quant())) { + VLOG(6) << "Parse PtenKernel output: skip extra & quant output - " + << out_name; + continue; + } VLOG(6) << "Parse PtenKernel output: " << out_name; output_names_.emplace_back(out_name); } @@ -165,9 +169,10 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { for (int i = 0; i < op_proto_->attrs_size(); ++i) { auto& attr = op_proto_->attrs()[i]; auto& attr_name = attr.name(); - if (attr_name == "use_mkldnn" || attr_name == "op_role" || - attr_name == "op_role_var" || attr_name == "op_namescope" || - attr_name == "op_callstack" || attr_name == "op_device") { + if (attr_name == "use_mkldnn" || attr_name == "use_cudnn" || + attr_name == "op_role" || attr_name == "op_role_var" || + attr_name == "op_namescope" || attr_name == "op_callstack" || + attr_name == "op_device") { VLOG(6) << "Parse PtenKernel attribute: skip needless attr - " << attr_name; continue; From ad294a81fa340f439e75a41ba7c024a85d30b0e6 Mon Sep 17 00:00:00 2001 From: Yang Date: Wed, 23 Feb 2022 19:12:41 +0800 Subject: [PATCH 04/85] [Phi] move flip op to phi kernel (#39822) --- paddle/fluid/operators/flip_op.cc | 13 +-- paddle/fluid/operators/flip_op.cu | 129 ----------------------- paddle/fluid/operators/flip_op.h | 83 --------------- paddle/phi/kernels/cpu/flip_kernel.cc | 77 ++++++++++++++ paddle/phi/kernels/flip_kernel.h | 29 ++++++ paddle/phi/kernels/gpu/flip_kernel.cu | 141 ++++++++++++++++++++++++++ 6 files changed, 250 insertions(+), 222 deletions(-) delete mode 100644 
paddle/fluid/operators/flip_op.cu delete mode 100644 paddle/fluid/operators/flip_op.h create mode 100644 paddle/phi/kernels/cpu/flip_kernel.cc create mode 100644 paddle/phi/kernels/flip_kernel.h create mode 100644 paddle/phi/kernels/gpu/flip_kernel.cu diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc index 3f6171b8a07b0..fc03ef0afae51 100644 --- a/paddle/fluid/operators/flip_op.cc +++ b/paddle/fluid/operators/flip_op.cc @@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/flip_op.h" #include #include #include + +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/complex.h" namespace paddle { namespace operators { @@ -29,6 +29,7 @@ class FlipOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + // TODO move to phi kernel void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE_EQ( ctx->HasInput("X"), true, @@ -150,14 +151,6 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(flip, ops::FlipOp, ops::FlipOpMaker, ops::FlipOpInferVarType, ops::FlipOpGradMaker, ops::FlipOpGradMaker); -REGISTER_OP_CPU_KERNEL( - flip, ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel>, - ops::FlipKernel>); /* ========================== register checkpoint ===========================*/ REGISTER_OP_VERSION(flip) diff --git a/paddle/fluid/operators/flip_op.cu b/paddle/fluid/operators/flip_op.cu deleted file mode 100644 index b9f8b16214fe4..0000000000000 --- a/paddle/fluid/operators/flip_op.cu +++ /dev/null @@ -1,129 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/flip_op.h" - -#include -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/platform/complex.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using CUDADeviceContext = paddle::platform::CUDADeviceContext; - -template -__global__ void flip_cuda_kernel(const int N, const T* in_data, T* out_data, - int64_t* x_shape, int64_t* x_stride, - int* flip_dims, int flip_dims_size, - int total_dims) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= N) { - return; - } - - int cur_indices = idx, rem = 0, dst_offset = 0; - for (int i = 0; i < total_dims; ++i) { - int64_t temp = cur_indices; - cur_indices = cur_indices / x_stride[i]; - rem = temp - cur_indices * x_stride[i]; - // flip the indices if it is in flip_dims - for (int j = 0; j < flip_dims_size; ++j) { - if (i == flip_dims[j]) { - cur_indices = x_shape[i] - 1 - cur_indices; - } - } - dst_offset += cur_indices * x_stride[i]; - cur_indices = rem; - } - out_data[idx] = in_data[dst_offset]; -} - -template -class FlipKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto gplace = ctx.GetPlace(); - auto cplace = platform::CPUPlace(); - auto& dev_ctx = ctx.template device_context(); - - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - auto* in_data = x->data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); - auto flip_dims = ctx.template Attr>("axis"); - - const int flip_dims_size = static_cast(flip_dims.size()); - auto x_dims = x->dims(); - const int total_dims = x_dims.size(); - const int N = x->numel(); - - int block_size = 512; - dim3 dim_block(block_size); - dim3 dim_grid((N + block_size - 1) / block_size); - - for (size_t i = 0; i < flip_dims.size(); ++i) { - if (flip_dims[i] < 0) { - flip_dims[i] += total_dims; - } - } - - auto x_stride = phi::stride(x_dims); - std::vector x_dims_v = phi::vectorize(x_dims); - std::vector x_stride_v = phi::vectorize(x_stride); - - int bytes = total_dims * sizeof(int64_t); - auto x_strides_array_tmp = memory::Alloc(dev_ctx, bytes); - int64_t* x_strides_array_gpu = - reinterpret_cast(x_strides_array_tmp->ptr()); - memory::Copy(gplace, x_strides_array_gpu, cplace, x_stride_v.data(), bytes, - dev_ctx.stream()); - - auto x_shape_array_tmp = memory::Alloc(dev_ctx, bytes); - int64_t* x_shape_array_gpu = - reinterpret_cast(x_shape_array_tmp->ptr()); - memory::Copy(gplace, x_shape_array_gpu, cplace, x_dims_v.data(), bytes, - dev_ctx.stream()); - - bytes = flip_dims_size * sizeof(int); - auto flip_dims_array_tmp = memory::Alloc(dev_ctx, bytes); - int* flip_dims_array_gpu = - reinterpret_cast(flip_dims_array_tmp->ptr()); - memory::Copy(gplace, flip_dims_array_gpu, cplace, flip_dims.data(), bytes, - dev_ctx.stream()); - - flip_cuda_kernel< - T><<>>( - N, in_data, out_data, x_shape_array_gpu, x_strides_array_gpu, - flip_dims_array_gpu, flip_dims_size, total_dims); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - flip, ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel, - ops::FlipKernel>, - ops::FlipKernel>); diff --git a/paddle/fluid/operators/flip_op.h b/paddle/fluid/operators/flip_op.h deleted file mode 100644 index 3c00df5f67d19..0000000000000 --- a/paddle/fluid/operators/flip_op.h +++ /dev/null @@ -1,83 +0,0 @@ -/* Copyright (c) 2020 
PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -constexpr size_t dim_bitset_size = 64; - -template -class FlipKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override; -}; - -template -class FlipKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - Tensor* out = ctx.Output("Out"); - auto flip_dims = ctx.template Attr>("axis"); - - auto x_dims = x->dims(); - const int total_dims = x_dims.size(); - std::bitset dim_bitset; - for (size_t i = 0; i < flip_dims.size(); ++i) { - int dim = flip_dims[i]; - if (flip_dims[i] < 0) { - dim += total_dims; - } - dim_bitset[dim] = true; - } - auto x_strides = phi::stride(x_dims); - auto numel = x->numel(); - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); -#ifdef PADDLE_WITH_MKLML -#pragma omp parallel for -#endif - for (int64_t i = 0; i < numel; ++i) { - int64_t cur_indices = i; - int64_t rem = 0; - int64_t dst_offset = 0; - - for (int d = 0; d < total_dims; ++d) { - int64_t temp = cur_indices; - cur_indices = cur_indices / x_strides[d]; - rem = temp - cur_indices * x_strides[d]; - dst_offset += dim_bitset[d] - ? (x_dims[d] - 1 - cur_indices) * x_strides[d] - : cur_indices * x_strides[d]; - cur_indices = rem; - } - out_data[i] = x_data[dst_offset]; - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/flip_kernel.cc b/paddle/phi/kernels/cpu/flip_kernel.cc new file mode 100644 index 0000000000000..fa1625d65bdc9 --- /dev/null +++ b/paddle/phi/kernels/cpu/flip_kernel.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/flip_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +constexpr size_t dim_bitset_size = 64; + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + auto x_dims = x.dims(); + const int total_dims = x_dims.size(); + std::bitset dim_bitset; + for (size_t i = 0; i < axis.size(); ++i) { + int dim = axis[i]; + if (axis[i] < 0) { + dim += total_dims; + } + dim_bitset[dim] = true; + } + auto x_strides = phi::stride(x_dims); + auto numel = x.numel(); + const T* x_data = x.data(); + T* out_data = dev_ctx.template Alloc(out); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (int64_t i = 0; i < numel; ++i) { + int64_t cur_indices = i; + int64_t rem = 0; + int64_t dst_offset = 0; + + for (int d = 0; d < total_dims; ++d) { + int64_t temp = cur_indices; + cur_indices = cur_indices / x_strides[d]; + rem = temp - cur_indices * x_strides[d]; + dst_offset += dim_bitset[d] ? (x_dims[d] - 1 - cur_indices) * x_strides[d] + : cur_indices * x_strides[d]; + cur_indices = rem; + } + out_data[i] = x_data[dst_offset]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(flip, + CPU, + ALL_LAYOUT, + phi::FlipKernel, + float, + double, + int32_t, + int64_t, + bool, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/flip_kernel.h b/paddle/phi/kernels/flip_kernel.h new file mode 100644 index 0000000000000..4470486fec0fb --- /dev/null +++ b/paddle/phi/kernels/flip_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/gpu/flip_kernel.cu b/paddle/phi/kernels/gpu/flip_kernel.cu new file mode 100644 index 0000000000000..668d673bd3269 --- /dev/null +++ b/paddle/phi/kernels/gpu/flip_kernel.cu @@ -0,0 +1,141 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/flip_kernel.h" + +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +__global__ void flip_cuda_kernel(const int N, + const T* in_data, + T* out_data, + int64_t* x_shape, + int64_t* x_stride, + int* flip_dims, + int flip_dims_size, + int total_dims) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) { + return; + } + + int cur_indices = idx, rem = 0, dst_offset = 0; + for (int i = 0; i < total_dims; ++i) { + int64_t temp = cur_indices; + cur_indices = cur_indices / x_stride[i]; + rem = temp - cur_indices * x_stride[i]; + // flip the indices if it is in flip_dims + for (int j = 0; j < flip_dims_size; ++j) { + if (i == flip_dims[j]) { + cur_indices = x_shape[i] - 1 - cur_indices; + } + } + dst_offset += cur_indices * x_stride[i]; + cur_indices = rem; + } + out_data[idx] = in_data[dst_offset]; +} + +template +void FlipKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axis, + DenseTensor* out) { + const auto gplace = dev_ctx.GetPlace(); + auto cplace = phi::CPUPlace(); + std::vector flip_dims = axis; + + auto* in_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + + const int flip_dims_size = static_cast(flip_dims.size()); + auto x_dims = x.dims(); + const int total_dims = x_dims.size(); + const int N = x.numel(); + + int block_size = 512; + dim3 dim_block(block_size); + dim3 dim_grid((N + block_size - 1) / block_size); + + for (size_t i = 0; i < flip_dims.size(); ++i) { + if (flip_dims[i] < 0) { + flip_dims[i] += total_dims; + } + } + + auto x_stride = phi::stride(x_dims); + std::vector x_dims_v = phi::vectorize(x_dims); + std::vector x_stride_v = phi::vectorize(x_stride); + + int bytes = total_dims * sizeof(int64_t); + auto x_strides_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + int64_t* x_strides_array_gpu = + reinterpret_cast(x_strides_array_tmp->ptr()); + paddle::memory::Copy(gplace, + x_strides_array_gpu, + cplace, + x_stride_v.data(), + bytes, + dev_ctx.stream()); + + auto x_shape_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + int64_t* x_shape_array_gpu = + reinterpret_cast(x_shape_array_tmp->ptr()); + paddle::memory::Copy(gplace, + x_shape_array_gpu, + cplace, + x_dims_v.data(), + bytes, + dev_ctx.stream()); + + bytes = flip_dims_size * sizeof(int); + auto flip_dims_array_tmp = paddle::memory::Alloc(dev_ctx, bytes); + int* flip_dims_array_gpu = reinterpret_cast(flip_dims_array_tmp->ptr()); + paddle::memory::Copy(gplace, + flip_dims_array_gpu, + cplace, + flip_dims.data(), + bytes, + dev_ctx.stream()); + + flip_cuda_kernel<<>>( + N, + in_data, + out_data, + x_shape_array_gpu, + x_strides_array_gpu, + flip_dims_array_gpu, + flip_dims_size, + total_dims); +} +} // namespace phi + +PD_REGISTER_KERNEL(flip, + GPU, + ALL_LAYOUT, + phi::FlipKernel, + float, + double, + phi::dtype::float16, + int, + int64_t, + bool, + phi::dtype::complex, + phi::dtype::complex) {} From 30992ea059b90c7ad6380c4a1164486f9a0e7210 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 23 Feb 2022 19:26:18 +0800 Subject: [PATCH 05/85] [phi] move randperm to phi (#39816) * move randperm to phi * fix npu * fix memory::Copy --- paddle/fluid/operators/randperm_op.cc | 8 ---- paddle/fluid/operators/randperm_op.cu | 24 ---------- paddle/fluid/platform/device_context.cc | 1 + paddle/phi/core/device_context.cc | 35 ++++++++++++-- 
paddle/phi/core/device_context.h | 13 ++++++ paddle/phi/kernels/cpu/randperm_kernel.cc | 46 ++++++++++++++++++ paddle/phi/kernels/gpu/randperm_kernel.cu | 57 +++++++++++++++++++++++ paddle/phi/kernels/randperm_kernel.h | 28 +++++++++++ paddle/phi/ops/compat/randperm_sig.cc | 25 ++++++++++ 9 files changed, 201 insertions(+), 36 deletions(-) delete mode 100644 paddle/fluid/operators/randperm_op.cu create mode 100644 paddle/phi/kernels/cpu/randperm_kernel.cc create mode 100644 paddle/phi/kernels/gpu/randperm_kernel.cu create mode 100644 paddle/phi/kernels/randperm_kernel.h create mode 100644 paddle/phi/ops/compat/randperm_sig.cc diff --git a/paddle/fluid/operators/randperm_op.cc b/paddle/fluid/operators/randperm_op.cc index bdc2ea0b5bfbb..1b28ab3c133f7 100644 --- a/paddle/fluid/operators/randperm_op.cc +++ b/paddle/fluid/operators/randperm_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/randperm_op.h" #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -89,10 +88,3 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, paddle::operators::RandpermOpVarTypeInference); - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_CPU_KERNEL(randperm, kernel, kernel, kernel, - kernel); diff --git a/paddle/fluid/operators/randperm_op.cu b/paddle/fluid/operators/randperm_op.cu deleted file mode 100644 index 7ed52a8fd25b1..0000000000000 --- a/paddle/fluid/operators/randperm_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/randperm_op.h" - -template -using kernel = - paddle::operators::RandpermKernel; - -REGISTER_OP_CUDA_KERNEL(randperm, kernel, kernel, kernel, - kernel); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 4282ec20623c9..6a7956628f804 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -172,6 +172,7 @@ inline void EmplaceDeviceContext( .get()); dev_ctx->SetGenerator(framework::DefaultCPUGenerator().get()); } + dev_ctx->SetHostGenerator(framework::DefaultCPUGenerator().get()); dev_ctx->SetHostAllocator( memory::allocation::AllocatorFacade::Instance() .GetAllocator(platform::CPUPlace()) diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index c3e0d2a75228b..9c1d85251f892 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -119,22 +119,39 @@ struct DeviceContext::Impl { gen, phi::errors::InvalidArgument( "Required generator shall not be nullptr, but received nullptr.")); - generator_ = gen; + device_generator_ = gen; } Generator* GetGenerator() const { PADDLE_ENFORCE_NOT_NULL( - generator_, + device_generator_, phi::errors::InvalidArgument("Required generator_ shall not be " "nullptr, but received nullptr.")); - return generator_; + return device_generator_; + } + + void SetHostGenerator(Generator* gen) { + PADDLE_ENFORCE_NOT_NULL( + gen, + phi::errors::InvalidArgument( + "Required generator shall not be nullptr, but received nullptr.")); + host_generator_ = gen; + } + + Generator* GetHostGenerator() const { + PADDLE_ENFORCE_NOT_NULL( + host_generator_, + phi::errors::InvalidArgument("Required generator_ shall not be " + "nullptr, but received nullptr.")); + return host_generator_; } private: const Allocator* device_allocator_{nullptr}; const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; - Generator* generator_{nullptr}; + Generator* device_generator_{nullptr}; + Generator* host_generator_{nullptr}; }; DeviceContext::DeviceContext() { impl_ = std::make_unique(); } @@ -143,6 +160,8 @@ DeviceContext::DeviceContext(const DeviceContext& other) { impl_->SetHostAllocator(&other.GetHostAllocator()); impl_->SetAllocator(&other.GetAllocator()); impl_->SetZeroAllocator(&other.GetZeroAllocator()); + impl_->SetHostGenerator(other.GetHostGenerator()); + impl_->SetGenerator(other.GetGenerator()); } DeviceContext::DeviceContext(DeviceContext&& other) { @@ -224,4 +243,12 @@ void DeviceContext::SetGenerator(Generator* gen) { impl_->SetGenerator(gen); } Generator* DeviceContext::GetGenerator() const { return impl_->GetGenerator(); } +void DeviceContext::SetHostGenerator(Generator* gen) { + impl_->SetHostGenerator(gen); +} + +Generator* DeviceContext::GetHostGenerator() const { + return impl_->GetHostGenerator(); +} + } // namespace phi diff --git a/paddle/phi/core/device_context.h b/paddle/phi/core/device_context.h index 7c1411e3bef37..689f4e4e66d15 100644 --- a/paddle/phi/core/device_context.h +++ b/paddle/phi/core/device_context.h @@ -132,6 +132,19 @@ class DeviceContext { */ Generator* GetGenerator() const; + /** + * @brief Set the host generator for special op. + * + * @param Generator + */ + void SetHostGenerator(Generator*); + /** + * @brief Get the host generator object. 
+ * + * @return Generator + */ + Generator* GetHostGenerator() const; + private: struct Impl; std::unique_ptr impl_; diff --git a/paddle/phi/kernels/cpu/randperm_kernel.cc b/paddle/phi/kernels/cpu/randperm_kernel.cc new file mode 100644 index 0000000000000..28092c8df6d15 --- /dev/null +++ b/paddle/phi/kernels/cpu/randperm_kernel.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/randperm_kernel.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RandpermKernel(const Context& ctx, + int n, + DataType dtype, + DenseTensor* out) { + T* out_data = ctx.template Alloc(out); + auto gen_ptr = ctx.GetHostGenerator(); + auto engine = gen_ptr->GetCPUEngine(); + + for (int i = 0; i < n; ++i) { + out_data[i] = static_cast(i); + } + std::shuffle(out_data, out_data + n, *engine); +} + +} // namespace phi + +PD_REGISTER_KERNEL(randperm, + CPU, + ALL_LAYOUT, + phi::RandpermKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu new file mode 100644 index 0000000000000..f75f768b633a3 --- /dev/null +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/kernels/randperm_kernel.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RandpermKernel(const Context& ctx, + int n, + DataType dtype, + DenseTensor* out) { + DenseTensor tmp; + tmp.Resize(phi::make_ddim({n})); + T* tmp_data = ctx.template HostAlloc(&tmp); + + auto gen_ptr = ctx.GetHostGenerator(); + auto engine = gen_ptr->GetCPUEngine(); + + for (int i = 0; i < n; ++i) { + tmp_data[i] = static_cast(i); + } + std::shuffle(tmp_data, tmp_data + n, *engine); + + T* out_data = ctx.template Alloc(out); + auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); + paddle::memory::Copy( + out->place(), out_data, tmp.place(), tmp_data, size, 0); +} + +} // namespace phi + +PD_REGISTER_KERNEL(randperm, + GPU, + ALL_LAYOUT, + phi::RandpermKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/randperm_kernel.h b/paddle/phi/kernels/randperm_kernel.h new file mode 100644 index 0000000000000..63bdac6da6fdc --- /dev/null +++ b/paddle/phi/kernels/randperm_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" + +namespace phi { + +template +void RandpermKernel(const Context& ctx, + int n, + DataType dtype, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/randperm_sig.cc b/paddle/phi/ops/compat/randperm_sig.cc new file mode 100644 index 0000000000000..14b28512e402a --- /dev/null +++ b/paddle/phi/ops/compat/randperm_sig.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
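The CPU and GPU randperm kernels above share one permutation scheme: fill a buffer with 0..n-1, then shuffle it with the engine obtained from the host generator (the GPU path does this in a host-side temporary tensor and then copies the result to device memory). A minimal standalone sketch of that scheme, with std::mt19937_64 standing in for the engine returned by GetHostGenerator()->GetCPUEngine():

    #include <algorithm>
    #include <cstdint>
    #include <numeric>
    #include <random>
    #include <vector>

    // Illustrative only: iota + shuffle, the same idea as the phi RandpermKernel.
    std::vector<int64_t> RandpermSketch(int n, uint64_t seed) {
      std::vector<int64_t> out(n);
      std::iota(out.begin(), out.end(), 0);  // 0, 1, ..., n-1
      std::mt19937_64 engine(seed);
      std::shuffle(out.begin(), out.end(), engine);
      return out;
    }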
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RandpermOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(randperm, phi::RandpermOpArgumentMapping); From 95280a368b9f41e6f5ca3feff138ff82d6a56bf9 Mon Sep 17 00:00:00 2001 From: Sing_chan <51314274+betterpig@users.noreply.github.com> Date: Wed, 23 Feb 2022 19:41:33 +0800 Subject: [PATCH 06/85] move trunc_op's infere shape to phi (#39772) * move trunc_op's infere shape * modify according to risheng's comment --- paddle/fluid/operators/trunc_op.cc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/trunc_op.cc b/paddle/fluid/operators/trunc_op.cc index bd3dc002990a7..54f4deac80a74 100644 --- a/paddle/fluid/operators/trunc_op.cc +++ b/paddle/fluid/operators/trunc_op.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -21,14 +23,6 @@ namespace operators { class TruncOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "trunc"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "trunc"); - auto input_dims = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", input_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class TruncOpMaker : public framework::OpProtoAndCheckerMaker { @@ -75,9 +69,13 @@ class TruncGradOpMaker : public framework::SingleGradOpMaker { } // namespace operators } // namespace paddle +DELCARE_INFER_SHAPE_FUNCTOR(trunc, TruncInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + namespace ops = paddle::operators; REGISTER_OPERATOR(trunc, ops::TruncOp, ops::TruncOpMaker, ops::TruncGradOpMaker, - ops::TruncGradOpMaker); + ops::TruncGradOpMaker, + TruncInferShapeFunctor); REGISTER_OPERATOR(trunc_grad, ops::TruncGradOp); From 96d530c1286936c78b5ad6869d926e159b4563b5 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Wed, 23 Feb 2022 19:46:02 +0800 Subject: [PATCH 07/85] move array_ref_test and small_vector_test into paddle/utils and format header macro define (#39831) --- paddle/testing/CMakeLists.txt | 2 -- paddle/utils/CMakeLists.txt | 2 ++ paddle/utils/any.h | 7 ++----- paddle/utils/array_ref.h | 7 ++----- paddle/{testing => utils}/array_ref_test.cc | 0 paddle/utils/flat_hash_map.h | 2 +- paddle/utils/none.h | 9 +++------ paddle/utils/optional.h | 8 +++----- paddle/utils/small_vector.h | 9 +++------ paddle/{testing => utils}/small_vector_test.cc | 0 10 files changed, 16 insertions(+), 30 deletions(-) rename paddle/{testing => utils}/array_ref_test.cc (100%) rename paddle/{testing => utils}/small_vector_test.cc (100%) diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index fe288ec2bf1d1..eace7c41f4a31 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -3,5 +3,3 @@ if(WITH_TESTING) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS init 
device_context memory gtest gflags) endif() -cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) -cc_test(array_ref_test SRCS array_ref_test.cc DEPS gtest gflags) diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index 2b4803e353854..64c88a47b4393 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -1 +1,3 @@ add_subdirectory(string) +cc_test(small_vector_test SRCS small_vector_test.cc DEPS gtest gflags) +cc_test(array_ref_test SRCS array_ref_test.cc DEPS gtest gflags) diff --git a/paddle/utils/any.h b/paddle/utils/any.h index d0e72b7063579..148d3f45b56ec 100644 --- a/paddle/utils/any.h +++ b/paddle/utils/any.h @@ -6,8 +6,7 @@ // See http://www.boost.org/libs/any for Documentation. -#ifndef PADDLE_ANY_INCLUDED -#define PADDLE_ANY_INCLUDED +#pragma once // what: variant type boost::any // who: contributed by Kevlin Henney, @@ -168,12 +167,10 @@ template inline const ValueType *unsafe_any_cast(const any *operand) { return unsafe_any_cast(const_cast(operand)); } -} +} // namespace paddle // Copyright Kevlin Henney, 2000, 2001, 2002. All rights reserved. // // Distributed under the Boost Software License, Version 1.0. (See // accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) - -#endif diff --git a/paddle/utils/array_ref.h b/paddle/utils/array_ref.h index 9b39e9775f97a..d2ab762bb154f 100644 --- a/paddle/utils/array_ref.h +++ b/paddle/utils/array_ref.h @@ -12,8 +12,7 @@ // //===----------------------------------------------------------------------===// -#ifndef PADDLE_UTILS_ARRAY_REF_H_ -#define PADDLE_UTILS_ARRAY_REF_H_ +#pragma once #include #include @@ -332,6 +331,4 @@ inline bool operator!=(SmallVectorImpl &LHS, ArrayRef RHS) { return !(LHS == RHS); } -} // end namespace paddle - -#endif // PADDLE_UTILS_ARRAY_REF_H_ +} // namespace paddle diff --git a/paddle/testing/array_ref_test.cc b/paddle/utils/array_ref_test.cc similarity index 100% rename from paddle/testing/array_ref_test.cc rename to paddle/utils/array_ref_test.cc diff --git a/paddle/utils/flat_hash_map.h b/paddle/utils/flat_hash_map.h index 07b7b5d3c821c..64a75fffa5767 100644 --- a/paddle/utils/flat_hash_map.h +++ b/paddle/utils/flat_hash_map.h @@ -1741,4 +1741,4 @@ struct power_of_two_std_hash : std::hash { typedef paddle::power_of_two_hash_policy hash_policy; }; -} // end namespace paddle +} // namespace paddle diff --git a/paddle/utils/none.h b/paddle/utils/none.h index 20d6f4d2c7dde..d2da8f26a118f 100644 --- a/paddle/utils/none.h +++ b/paddle/utils/none.h @@ -15,8 +15,7 @@ // You are welcome to contact the author at: // fernando_cacciola@hotmail.com // -#ifndef PADDLE_NONE_17SEP2003_HPP -#define PADDLE_NONE_17SEP2003_HPP +#pragma once namespace paddle { @@ -26,7 +25,7 @@ struct none_helper {}; typedef int detail::none_helper::*none_t; -} // namespace boost +} // namespace paddle // NOTE: Borland users have to include this header outside any precompiled // headers @@ -37,6 +36,4 @@ namespace paddle { none_t const none = ((none_t)0); -} // namespace boost - -#endif +} // namespace paddle diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h index 00d8ae28ee836..d2a9a3f11ef3c 100644 --- a/paddle/utils/optional.h +++ b/paddle/utils/optional.h @@ -17,8 +17,7 @@ // You are welcome to contact the author at: // fernando_cacciola@hotmail.com // -#ifndef PADDLE_OPTIONAL_OPTIONAL_FLC_19NOV2002_HPP -#define PADDLE_OPTIONAL_OPTIONAL_FLC_19NOV2002_HPP +#pragma once #include #include @@ -27,6 +26,8 @@ #include "none.h" +namespace 
paddle { + // Daniel Wallin discovered that bind/apply.hpp badly interacts with the apply<> // member template of a factory as used in the optional<> implementation. // He proposed this simple fix which is to move the call to apply<> outside @@ -38,7 +39,6 @@ void construct(Factory const& factory, void* address) { } } -namespace paddle { template class optional; @@ -865,5 +865,3 @@ inline void optional_swap(optional& x, optional& y) { } // namespace optional_detail } // namespace paddle - -#endif diff --git a/paddle/utils/small_vector.h b/paddle/utils/small_vector.h index 48af2491b89f8..14cb8f410f460 100644 --- a/paddle/utils/small_vector.h +++ b/paddle/utils/small_vector.h @@ -18,8 +18,7 @@ // //===----------------------------------------------------------------------===// -#ifndef PADDLE_UTILS_SMALL_VECTOR_H_ -#define PADDLE_UTILS_SMALL_VECTOR_H_ +#pragma once #include #include @@ -1461,7 +1460,7 @@ static_assert(sizeof(SmallVectorSizeType) == sizeof(uint32_t), "Expected SmallVectorBase variant to be in use."); #endif -} // end namespace paddle +} // namespace paddle namespace std { @@ -1479,6 +1478,4 @@ inline void swap(paddle::SmallVector &LHS, LHS.swap(RHS); } -} // end namespace std - -#endif // PADDLE_UTILS_SMALL_VECTOR_H_ +} // namespace std diff --git a/paddle/testing/small_vector_test.cc b/paddle/utils/small_vector_test.cc similarity index 100% rename from paddle/testing/small_vector_test.cc rename to paddle/utils/small_vector_test.cc From ca11a0e5c2bc6a998adc7dc7d65c403cd38ec0f5 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 23 Feb 2022 19:57:22 +0800 Subject: [PATCH 08/85] Support dispensable inputs for eager final state codegen (#39743) --- .../final_state_generator/eager_gen.py | 66 ++++++++++++------- .../final_state_generator/python_c_gen.py | 22 +++++-- paddle/fluid/pybind/eager_utils.cc | 26 ++++++++ paddle/fluid/pybind/eager_utils.h | 6 ++ 4 files changed, 94 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py index 0578f930679b8..c6e56e34627a5 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py @@ -143,6 +143,11 @@ def IntermediateValidationCheck(intermediate_outputs, forward_returns_list): assert pos in intermediate_positions +def ParseDispensable(string): + # string: "X, Y" + return [v.strip() for v in string.split(",")] + + def ParseIntermediate(string): return [v.strip() for v in string.split(",")] @@ -596,11 +601,11 @@ def GenerateNodeDefinition(fwd_api_name, bwd_api_name, backward_fwd_input_map, return node_definition_str -def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, - forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list): +def GenerateNodeCreationCodes( + fwd_api_name, bwd_api_name, forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, optional_inputs): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -674,10 +679,17 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, # SetTensorWrappers set_tensor_wrappers_list = [] for name, 
(_, is_fwd_input, _) in backward_fwd_input_map.items(): + is_optional = (name in optional_inputs) if is_fwd_input: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" + if is_optional: + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, true);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, true);" else: - set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" + if is_optional: + set_tensor_wrappers = f" if({name}.is_initialized()) grad_node->SetTensorWrapper{name}({name}, false);" + else: + set_tensor_wrappers = f" grad_node->SetTensorWrapper{name}({name}, false);" set_tensor_wrappers_list.append(set_tensor_wrappers) set_tensor_wrappers_str = "\n".join(set_tensor_wrappers_list) @@ -762,11 +774,12 @@ def GenerateNodeCreationCodes(fwd_api_name, bwd_api_name, return node_creation_str -def GenerateForwardDefinition( - fwd_api_name, bwd_api_name, forward_inputs_position_map, - forward_outputs_position_map, forward_attrs_list, - backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, intermediate_outputs): +def GenerateForwardDefinition(fwd_api_name, bwd_api_name, + forward_inputs_position_map, + forward_outputs_position_map, forward_attrs_list, + backward_fwd_input_map, backward_grad_input_map, + backward_grad_output_map, backward_attrs_list, + optional_inputs, intermediate_outputs): # fwd_api_name = "" # forward_inputs_position_map = { "name" : [type, fwd_position] } # forward_outputs_position_map = { "name" : [type, fwd_position] } @@ -775,6 +788,7 @@ def GenerateForwardDefinition( # backward_grad_input_map = { "name" : [type, fwd_position, orig_position] ...} # backward_grad_output_map = { "name" : [type, fwd_position, orig_position] ...} # backward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] + # optional_inputs = ["name0", ...] 
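    # Note (illustration only): an input listed in optional_inputs surfaces in
    # the generated C++ forward signature as an optional Tensor, roughly
    #   const paddle::optional<paddle::experimental::Tensor>& name0
    # while a required plain-Tensor input keeps
    #   const paddle::experimental::Tensor& name0
    # as built by arg_str below.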
# Get Function Args num_inputs = len(forward_attrs_list) + len(forward_inputs_position_map.keys( @@ -784,17 +798,18 @@ def GenerateForwardDefinition( inputs_call_list = ["" for i in range(num_inputs)] for name, (ttype, pos) in forward_inputs_position_map.items(): inputs_call_list[pos] = f"{name}" + is_optional = (name in optional_inputs) if IsPlainTensorType(ttype): - inputs_args_definition_list[ - pos] = f"const paddle::experimental::Tensor& {name}" - inputs_args_declaration_list[ - pos] = f"const paddle::experimental::Tensor& {name}" + if is_optional: + arg_str = f"const paddle::optional& {name}" + else: + arg_str = f"const paddle::experimental::Tensor& {name}" else: assert IsVectorTensorType(ttype) - inputs_args_definition_list[ - pos] = f"const std::vector& {name}" - inputs_args_declaration_list[ - pos] = f"const std::vector& {name}" + arg_str = f"const std::vector& {name}" + + inputs_args_definition_list[pos] = arg_str + inputs_args_declaration_list[pos] = arg_str for name, atype, default_val, pos in forward_attrs_list: inputs_call_list[pos] = name @@ -849,7 +864,7 @@ def GenerateForwardDefinition( fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list) + backward_grad_output_map, backward_attrs_list, optional_inputs) FORWARD_FUNCTION_TEMPLATE = """ {} {}({}) {{ @@ -1053,6 +1068,12 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): assert 'args' in bwd_api.keys() assert 'output' in bwd_api.keys() assert 'forward' in bwd_api.keys() + + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + bwd_forward_str = bwd_api['forward'] bwd_args_str = bwd_api['args'] bwd_returns_str = bwd_api['output'] @@ -1128,7 +1149,8 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): fwd_api_name, bwd_api_name, forward_inputs_position_map, forward_outputs_position_map, forward_attrs_list, backward_fwd_input_map, backward_grad_input_map, - backward_grad_output_map, backward_attrs_list, intermediate_outputs) + backward_grad_output_map, backward_attrs_list, optional_inputs, + intermediate_outputs) print("Generated Forward Definition: ", forward_definition_str) print("Generated Forward Declaration: ", forward_declaration_str) forward_definition_str += definition_declaration_pair[0] diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index a95d6dce29aad..5a536067dbe49 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -14,7 +14,7 @@ import os import argparse -from eager_gen import ReadFwdFile, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap +from eager_gen import ReadFwdFile, ParseDispensable, IsVectorTensorType, GetForwardFunctionName, ParseYamlForward, DetermineForwardPositionMap atype_to_parsing_function = { "bool": "CastPyArg2Boolean", @@ -70,10 +70,12 @@ def FindParsingFunctionFromAttributeType(atype): def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, - forward_attrs_list, forward_outputs_position_map): + forward_attrs_list, forward_outputs_position_map, + optional_inputs): # forward_inputs_position_map = { "name" : [type, fwd_position] } # 
forward_outputs_position_map = { "name" : [type, fwd_position] } # forward_attrs_list = [ [attr_name, attr_type, default_value, orig_position], ...] + # optional_inputs = [name0, ...] # Get EagerTensor from args # Get dygraph function call args @@ -82,7 +84,14 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, dygraph_function_call_list = ["" for i in range(num_args)] get_eager_tensor_str = "" for name, (ttype, pos) in forward_inputs_position_map.items(): - get_eager_tensor_str += f" auto& {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + is_optional = (name in optional_inputs) + if IsVectorTensorType(ttype): + get_eager_tensor_str += f" auto {name} = GetTensorListFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + else: + if is_optional: + get_eager_tensor_str += f" auto {name} = GetOptionalTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" + else: + get_eager_tensor_str += f" auto {name} = GetTensorFromArgs(\"{fwd_api_name}\", \"{name}\", args, {pos}, false);\n" dygraph_function_call_list[pos] = f"{name}" parse_attributes_str = "" @@ -267,6 +276,11 @@ def GeneratePythonCFile(filepath, python_c_str): fwd_args_str = fwd_api['args'] fwd_returns_str = fwd_api['output'] + # Parse Dispensable Inputs + optional_inputs = [] + if 'optional' in fwd_api.keys(): + optional_inputs = ParseDispensable(fwd_api['optional']) + # Collect Original Forward Inputs/Outputs and then perform validation checks forward_inputs_list, forward_attrs_list, forward_returns_list = ParseYamlForward( fwd_args_str, fwd_returns_str) @@ -283,7 +297,7 @@ def GeneratePythonCFile(filepath, python_c_str): python_c_function_str, python_c_function_reg_str = GeneratePythonCFunction( fwd_api_name, forward_inputs_position_map, forward_attrs_list, - forward_outputs_position_map) + forward_outputs_position_map, optional_inputs) python_c_function_list.append(python_c_function_str) python_c_function_reg_list.append(python_c_function_reg_str) print("Generated Python-C Function: ", python_c_function_str) diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 9c033376d6c43..c1e8822eec221 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -555,6 +555,32 @@ PyObject* ToPyObject( return dict; } +// For Final State Dygraph, +// We directly use paddle::optional(Tensor) as dispensable Tensor +paddle::optional GetOptionalTensorFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable) { + PyObject* obj = PyTuple_GET_ITEM(args, arg_idx); + + if (PyTuple_Check(obj)) { + obj = PyTuple_GET_ITEM(obj, 0); + } + + if (obj == nullptr || obj == Py_None) { + if (!dispensable) { + PADDLE_THROW(platform::errors::InvalidArgument( + "%s(): argument '%s' (position %d) must be Tensor, but got None", + op_type, arg_name, arg_idx)); + } + return {}; + } + + return paddle::make_optional( + reinterpret_cast(obj)->tensor); +} + +// For Intermediate State Dygraph, +// we use an uninitialized Tensor to represent dispensable Tensor paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index fb19e108aeb70..0c721d6124791 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -89,10 +89,15 @@ PyObject* ToPyObject(const std::tuple& out) { return 
result; } +paddle::optional GetOptionalTensorFromArgs( + const std::string& op_type, const std::string& arg_name, PyObject* args, + ssize_t arg_idx, bool dispensable = false); + paddle::experimental::Tensor& GetTensorFromArgs(const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); + std::vector GetTensorListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); @@ -102,6 +107,7 @@ paddle::experimental::Tensor* GetTensorPtrFromArgs(const std::string& op_type, PyObject* args, ssize_t arg_idx, bool dispensable = false); + std::vector GetTensorPtrListFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, ssize_t arg_idx, bool dispensable = false); From 058e1d8592e44fc913365520455d124333224adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Wed, 23 Feb 2022 20:56:53 +0800 Subject: [PATCH 09/85] infrt runtime supports phi, test=develop (#39836) * runtime supports pten kernels, test=develop * fixes a bug, test=develop --- paddle/infrt/dialect/phi/infrt_phi_base.td | 2 + paddle/infrt/dialect/phi/infrt_phi_kernel.td | 12 ++- paddle/infrt/dialect/phi/infrt_phi_tensor.td | 7 +- paddle/infrt/dialect/phi/phi_base.h | 14 ++++ paddle/infrt/host_context/kernel_frame.cc | 32 ++++++++ paddle/infrt/host_context/kernel_frame.h | 77 +++++++++++-------- paddle/infrt/host_context/kernel_utils.h | 18 +++-- .../infrt/host_context/kernel_utils_test.cc | 40 ++++++++++ .../host_context/mlir_to_runtime_translate.cc | 77 ++++++++++++------- paddle/infrt/host_context/op_executable.cc | 3 +- paddle/infrt/host_context/value.h | 13 +++- paddle/infrt/kernel/phi/context_kernels.cc | 2 +- paddle/infrt/kernel/phi/context_kernels.h | 2 +- .../infershaped_kernel_launcher.cc | 3 - .../phi/infershaped/phi_kernel_launcher.h | 29 +++++++ paddle/infrt/kernel/phi/registry.cc | 20 +++-- paddle/infrt/kernel/tensor_kernels.cc | 8 +- paddle/infrt/support/variant.h | 4 +- .../tests/dialect/pten/dense_tensor.mlir | 16 ++-- 19 files changed, 272 insertions(+), 107 deletions(-) diff --git a/paddle/infrt/dialect/phi/infrt_phi_base.td b/paddle/infrt/dialect/phi/infrt_phi_base.td index e297fad86be75..907f912d9e638 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_base.td +++ b/paddle/infrt/dialect/phi/infrt_phi_base.td @@ -23,6 +23,8 @@ class ContextTypeOf traits=[]>: let summary = !strconcat("!phi.context_", place, " type"); } +def PhiOpTrait : NativeOpTrait<"PhiOpTrait">; + def CPU_Allocator : AllocatorTypeOf<"CPU">; def GPU_Allocator : AllocatorTypeOf<"GPU">; diff --git a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/infrt_phi_kernel.td index 9ae469605860b..879994907cc0d 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/infrt_phi_kernel.td @@ -1,7 +1,10 @@ #ifndef PHI_KERNEL #define PHI_KERNEL -include "paddle/infrt/dialect/phi/infrt_phi_tensor.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/OpBase.td" +include "paddle/infrt/dialect/infrt_base.td" +include "paddle/infrt/dialect/phi/infrt_phi_base.td" def PHI_KernelDialect : Dialect { let name = "phi_kernel"; @@ -14,12 +17,7 @@ def PHI_KernelDialect : Dialect { } // PHI Kernel related ops. 
-class PDT_Kernel traits = []> : Op { -} - -def FakeKernelOp : PDT_Kernel<"phi.matmul.host.fp32"> { - let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); - let results = (outs DenseTensor:$output); +class PDT_Kernel traits = []> : Op { } def PDCK_AbsOp : PDT_Kernel<"phi.abs.host.fp32"> { diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/infrt_phi_tensor.td index b4607f632c9b9..b7b3b061fdbe4 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/infrt_phi_tensor.td @@ -18,7 +18,7 @@ def PHI_DenseTensorDialect : Dialect { } // PHI DenseTensor related Op. -class PDT_Op traits = []> : Op { +class PDT_Op traits = []> : Op { } class CreateDenseTensorOp @@ -53,4 +53,9 @@ def PDT_FillDenseTensorOp_f32 : FillDenseTensorOp; def PDT_CreateAllocatorOp_cpu : CreateCPUAllocatorOp; def PDT_CreateContextOp_cpu : CreateCPUContextOp; +def FakeKernelOp : PDT_Op<"fake_phi_kernel"> { + let arguments = (ins CPU_Context:$dev_ctx, DenseTensor:$x, DenseTensor:$y, BoolAttr:$transpose_x, BoolAttr:$transpose_y); + let results = (outs DenseTensor:$output); +} + #endif diff --git a/paddle/infrt/dialect/phi/phi_base.h b/paddle/infrt/dialect/phi/phi_base.h index e3e58c2269620..11174290f92bd 100644 --- a/paddle/infrt/dialect/phi/phi_base.h +++ b/paddle/infrt/dialect/phi/phi_base.h @@ -25,6 +25,20 @@ #define GET_TYPEDEF_CLASSES #include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" +namespace mlir { +namespace OpTrait { + +template +class PhiOpTrait : public OpTrait::TraitBase { + public: + static LogicalResult verifyTrait(Operation *op) { + return LogicalResult::success(); + } +}; + +} // namespace OpTrait +} // namespace mlir + namespace infrt { namespace phi {} // namespace phi } // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.cc b/paddle/infrt/host_context/kernel_frame.cc index 1acb35e898308..14e88be4b96bb 100644 --- a/paddle/infrt/host_context/kernel_frame.cc +++ b/paddle/infrt/host_context/kernel_frame.cc @@ -15,6 +15,7 @@ #include "paddle/infrt/host_context/kernel_frame.h" #include +#include namespace infrt { namespace host_context { @@ -25,5 +26,36 @@ std::ostream& operator<<(std::ostream& os, const KernelFrame& frame) { return os; } +#ifndef NDEBUG +std::string KernelFrame::DumpArgTypes() const { + std::stringstream ss; + for (auto* value : GetValues(0, GetNumElements())) { + if (value->is_type()) { + ss << "bool (" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "DenseHostTensor(" << &value->get() + << "), "; + } else if (value->is_type()) { + ss << "float(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "int(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "phi::DenseTensor(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "phi::MetaTensor(" << &value->get() << "), "; + } else if (value->is_type<::phi::CPUContext>()) { + ss << "phi::CPUContext(" << &value->get<::phi::CPUContext>() << "), "; + } else if (value->is_type()) { + ss << "none(" << &value->get() << "), "; + } else if (value->is_type()) { + ss << "CpuPhiContext(" << &value->get() << "), "; + } else { + ss << "typeid: " << value->index() << ", "; + } + } + return ss.str(); +} +#endif + } // namespace host_context } // namespace infrt diff --git a/paddle/infrt/host_context/kernel_frame.h b/paddle/infrt/host_context/kernel_frame.h index 35527872e624f..90887edb99166 100644 --- 
a/paddle/infrt/host_context/kernel_frame.h +++ b/paddle/infrt/host_context/kernel_frame.h @@ -31,20 +31,24 @@ namespace host_context { class KernelFrame { public: int GetNumArgs() const { return num_arguments_; } - int GetNumResults() const { return num_results_ == -1 ? 0 : num_results_; } - int GetNumAttributes() const { - return value_or_attrs_.size() - num_arguments_ - - (num_results_ == -1 ? 0 : num_results_); + int GetNumResults() const { + return value_or_attrs_.size() - num_arguments_ - GetNumAttributes(); } + int GetNumAttributes() const { return num_attrs_ == -1 ? 0 : num_attrs_; } //! Get something at a specific position \p index. The element might be an //! argument, an attribute or a result. template T& GetElementAt(int index) { - CHECK_LT(index, GetNumArgs() + GetNumAttributes() + GetNumResults()); + CHECK_LT(static_cast(index), GetNumElements()); return value_or_attrs_[index]->template get_or_default(); } + Value* GetElementAt(int index) { + CHECK_LT(static_cast(index), GetNumElements()); + return value_or_attrs_[index]; + } + // Get number of elements, either input, attributes or results. size_t GetNumElements() const { return value_or_attrs_.size(); } @@ -70,18 +74,21 @@ class KernelFrame { } Value* GetAttributeAt(int idx) { - CHECK_NE(num_results_, -1) - << "Must call SetNumResults before GetAttributeAt"; - CHECK_LT(idx, - static_cast(value_or_attrs_.size() - num_arguments_ - - num_results_)); - return value_or_attrs_[num_arguments_ + num_results_ + idx]; + // CHECK_NE(num_results_, -1) + //<< "Must call SetNumResults before GetAttributeAt"; + CHECK_LT(idx, GetNumAttributes()); + return value_or_attrs_[num_arguments_ + idx]; } void AddAttribute(Value* v) { - CHECK_NE(num_results_, -1) - << "Must call SetNumResults before calling AddAttribute"; + CHECK_LE(num_results_, 0) + << "Must call SetNumResults after calling AddAttribute"; value_or_attrs_.emplace_back(v); + if (num_attrs_ == -1) num_attrs_ = 0; + num_attrs_++; + + CHECK_EQ(value_or_attrs_.size(), + static_cast(num_arguments_ + num_attrs_)); } template @@ -96,35 +103,43 @@ class KernelFrame { template void SetResultAt(int index, T&& value) { - CHECK_LT(index, num_results_) << "Invalid result index"; - CHECK(value_or_attrs_[num_arguments_ + index]); - value_or_attrs_[num_arguments_ + index]->set(std::move(value)); + CHECK_LT(index, GetNumResults()) << "Invalid result index"; + CHECK(value_or_attrs_[num_arguments_ + GetNumAttributes() + index]); + value_or_attrs_[num_arguments_ + GetNumAttributes() + index]->set( + std::move(value)); } llvm::ArrayRef GetResults() const { - return GetValues(num_arguments_, num_results_); + CHECK_GE(num_results_, 0) << "Invalid results num"; + return GetValues(num_arguments_ + GetNumAttributes(), num_results_); } llvm::MutableArrayRef GetResults() { - return GetMutableValues(num_arguments_, num_results_); + CHECK_GE(num_results_, 0) << "Invalid results num"; + return GetMutableValues(num_arguments_ + GetNumAttributes(), num_results_); } llvm::ArrayRef GetValues(size_t from, size_t length) const { - CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + CHECK_LE(from + length, GetNumElements()); if (length == 0) return {}; return llvm::makeArrayRef(&value_or_attrs_[from], length); } llvm::MutableArrayRef GetMutableValues(size_t from, size_t length) { - CHECK_LE(static_cast(from + length), num_arguments_ + num_results_); + CHECK_LE(from + length, GetNumElements()); if (length == 0) return {}; return llvm::makeMutableArrayRef(&value_or_attrs_[from], length); } +#ifndef NDEBUG 
+ std::string DumpArgTypes() const; +#endif + bool IsEmpty() const { return value_or_attrs_.empty(); } protected: int num_arguments_{}; + int num_attrs_{-1}; int num_results_{-1}; llvm::SmallVector value_or_attrs_; @@ -136,15 +151,15 @@ class KernelFrameBuilder : public KernelFrame { public: void AddArgument(Value* value) { CHECK(value); - CHECK_EQ(num_results_, -1) - << "Should call AddArgument before calling SetNumResults"; + CHECK_EQ(num_attrs_, -1) + << "Should call AddArgument before calling SetAttributes"; value_or_attrs_.push_back(value); ++num_arguments_; } void SetResults(llvm::ArrayRef values) { - CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); - CHECK_EQ(num_results_, -1); + CHECK_EQ(num_arguments_ + GetNumAttributes(), + static_cast(value_or_attrs_.size())); for (Value* x : values) { value_or_attrs_.push_back(x); } @@ -152,28 +167,30 @@ class KernelFrameBuilder : public KernelFrame { } void SetNumResults(size_t n) { - CHECK_EQ(num_arguments_, static_cast(value_or_attrs_.size())); - CHECK_EQ(num_results_, -1); - num_results_ = n; + CHECK_EQ(num_arguments_ + GetNumAttributes(), + static_cast(value_or_attrs_.size())); for (size_t i = 0; i < n; i++) { value_or_attrs_.emplace_back(new Value); } + num_results_ = n; } void SetResultAt(int result_id, Value* value) { CHECK_EQ(static_cast(value_or_attrs_.size()), - num_arguments_ + num_results_) + num_arguments_ + GetNumAttributes() + num_results_) << "Call SetNumResults first"; - CHECK_LT(result_id + num_arguments_, + CHECK_LT(result_id + num_arguments_ + GetNumAttributes(), static_cast(value_or_attrs_.size())); CHECK(value); - value_or_attrs_[num_arguments_ + result_id]->set(value); + value_or_attrs_[num_arguments_ + GetNumAttributes() + result_id]->set( + value); } void Reset() { value_or_attrs_.clear(); num_arguments_ = 0; num_results_ = -1; + num_attrs_ = -1; } }; diff --git a/paddle/infrt/host_context/kernel_utils.h b/paddle/infrt/host_context/kernel_utils.h index 31d411006d237..2f630dcc213cb 100644 --- a/paddle/infrt/host_context/kernel_utils.h +++ b/paddle/infrt/host_context/kernel_utils.h @@ -209,9 +209,11 @@ struct KernelImpl { static void Invoke(KernelFrame* frame, const PreviousArgs&... pargs) { static_assert(out_idx != -1, "Do not place Results after RemainingResults"); - static_assert(const_idx == 0, - "Arguments and results should appear before attributes"); - Result arg(&frame->GetResults()[out_idx]); + // static_assert(const_idx == 0, + // "Arguments and results should appear before attributes"); + + // Result arg(&frame->GetResults()[out_idx]); + Result arg(new ValueRef()); KernelCallHelper< Tail...>::template Invoke(frame, pargs..., @@ -224,8 +226,8 @@ struct KernelImpl { struct KernelCallHelper, Tail...> { template static void Invoke(KernelFrame* frame, const PreviousArgs&... 
pargs) { - static_assert(const_idx != -1, - "Do not place Attributes after RemainingAttributes"); + // static_assert(const_idx != -1, + // "Do not place Attributes after RemainingAttributes"); Attribute arg(frame->GetAttributeAt(const_idx)); KernelCallHelper< Tail...>::template Invoke(frame, @@ -242,8 +244,8 @@ struct KernelImpl { static_assert(in_idx != -1, "Do not place Arguments after RemainingArguments"); static_assert(out_idx == 0, "Arguments should appear before results"); - static_assert(const_idx == 0, - "Arguments and results should appear before attributes."); + // static_assert(const_idx == 0, + // "Arguments and results should appear before attributes."); auto* arg = &frame->template GetElementAt(in_idx); KernelCallHelper< Tail...>::template Invoke(frame, @@ -265,7 +267,7 @@ struct KernelImpl { static_assert(const_idx == 0, "Arguments and results should appear before attributes."); - auto* value = frame->GetArgAt(in_idx); + auto* value = frame->GetElementAt(in_idx); auto&& arg = value->get(); KernelCallHelper< diff --git a/paddle/infrt/host_context/kernel_utils_test.cc b/paddle/infrt/host_context/kernel_utils_test.cc index bebd8d86e50bb..71d8904eb798f 100644 --- a/paddle/infrt/host_context/kernel_utils_test.cc +++ b/paddle/infrt/host_context/kernel_utils_test.cc @@ -67,5 +67,45 @@ TEST(KernelImpl, pair) { ASSERT_EQ(results[1]->get(), 3.f); } +void TestFunc(const std::string& arg_0, + const std::string& arg_1, + const std::string& arg_2, + Attribute attr_0, + Result res_0, + Result res_1) { + CHECK_EQ(arg_0, "arg_0"); + CHECK_EQ(arg_1, "arg_1"); + CHECK_EQ(arg_2, "arg_2"); + CHECK_EQ(attr_0.get(), "attr_0"); + + // res_0.Set(Argument(ValueRef(new Value()))); + // res_1.Set(Argument(ValueRef(new Value()))); +} + +TEST(KernelRegistry, basic) { + KernelFrameBuilder kernel_frame; + + Value arg_0(std::string{"arg_0"}); + Value arg_1(std::string{"arg_1"}); + Value arg_2(std::string{"arg_2"}); + Value attr_0(std::string{"attr_0"}); + + kernel_frame.AddArgument(&arg_0); + kernel_frame.AddArgument(&arg_1); + kernel_frame.AddArgument(&arg_2); + kernel_frame.AddAttribute(&attr_0); + kernel_frame.SetNumResults(2); + + CHECK_EQ(kernel_frame.GetNumArgs(), 3); + CHECK_EQ(kernel_frame.GetNumResults(), 2); + CHECK_EQ(kernel_frame.GetNumAttributes(), 1); + CHECK_EQ(kernel_frame.GetNumElements(), 6UL); + + CHECK_EQ(kernel_frame.GetArgAt(2), "arg_2"); + CHECK_EQ(kernel_frame.GetAttributeAt(0)->get(), "attr_0"); + + KernelImpl::Invoke(&kernel_frame); +} + } // namespace host_context } // namespace infrt diff --git a/paddle/infrt/host_context/mlir_to_runtime_translate.cc b/paddle/infrt/host_context/mlir_to_runtime_translate.cc index b47e2b27eab7c..17e6f7cb563d2 100644 --- a/paddle/infrt/host_context/mlir_to_runtime_translate.cc +++ b/paddle/infrt/host_context/mlir_to_runtime_translate.cc @@ -31,6 +31,7 @@ #include "boost/optional.hpp" #include "paddle/infrt/common/string.h" +#include "paddle/infrt/dialect/dense_tensor.h" #include "paddle/infrt/dialect/mlir_loader.h" #include "paddle/infrt/dialect/tensor_shape.h" #include "paddle/infrt/host_context/core_runtime.h" @@ -150,6 +151,17 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( return boost::none; } +template <> +boost::optional MlirToRuntimeTranslator::EmitAttribute( + const mlir::Attribute& attr) { + if (!attr.isa()) return boost::none; + if (attr.isa()) { + auto val = attr.cast(); + return val.getValue(); + } + return boost::none; +} + template <> boost::optional MlirToRuntimeTranslator::EmitAttribute( const mlir::Attribute& attr) { @@ 
-187,6 +199,7 @@ boost::optional MlirToRuntimeTranslator::EmitAttribute( return res; \ } +PROCESS_ARRAY_INT(bool, 1); PROCESS_ARRAY_INT(int16_t, 16); PROCESS_ARRAY_INT(int32_t, 32); PROCESS_ARRAY_INT(int64_t, 64); @@ -262,25 +275,6 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { << GetValue(operand) << " vs " << arg_value; } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - res_values.push_back(AddValue(res)); - - VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); - } - impl_->cur_op->SetResults(res_values); - -#ifdef INFRT_DEBUG - { - VLOG(3) << "check result"; - for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { - VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; - } - } -#endif - // process attributes auto attrs = op->getAttrs(); @@ -296,6 +290,8 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { impl_->cur_op->AppendAttribute(new Value(*v)); } else if (auto v = EmitAttribute(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); + } else if (auto v = EmitAttribute(attr.getValue())) { + impl_->cur_op->AppendAttribute(new Value(*v)); } else if (auto v = EmitAttribute>(attr.getValue())) { impl_->cur_op->AppendAttribute(new Value(std::move(*v))); } else if (auto v = EmitAttribute>(attr.getValue())) { @@ -311,6 +307,33 @@ bool MlirToRuntimeTranslator::EmitGeneralOp(mlir::Operation* op) { } } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + if (res.getType().isa<::infrt::DenseTensorType>()) { + auto r = impl_->value_map.try_emplace( + res, ValueRef(new Value{::phi::DenseTensor()})); + CHECK(r.second) << "Duplicate add mlir value [" << DumpToString(res) + << "]"; + res_values.push_back(r.first->second.get()); + } else { + res_values.push_back(AddValue(res)); + } + + VLOG(3) << "* op mlir res: " << DumpToString(res) << " " << GetValue(res); + } + impl_->cur_op->SetResults(res_values); + +#ifdef INFRT_DEBUG + { + VLOG(3) << "check result"; + for (int i = 0; i < impl_->cur_op->frame().GetNumResults(); i++) { + VLOG(3) << "+ res value: " << impl_->cur_op->frame().GetResults()[i]; + } + } +#endif + // process regions, we treat regions as attribute. auto num_regions = op->getNumRegions(); if (num_regions > 0) { @@ -440,14 +463,6 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, impl_->cur_op->AppendArgument(arg_value); } - // process results - llvm::SmallVector res_values; - for (int i = 0, e = op->getNumResults(); i < e; i++) { - auto res = op->getResult(i); - res_values.push_back(AddValue(res)); - } - impl_->cur_op->SetResults(res_values); - // process attribute auto& table = function_table ? 
*function_table : impl_->func_defs; { @@ -460,6 +475,14 @@ bool MlirToRuntimeTranslator::EmitCallOp(mlir::Operation* op, impl_->cur_op->AppendAttribute(new Value(function)); } + // process results + llvm::SmallVector res_values; + for (int i = 0, e = op->getNumResults(); i < e; i++) { + auto res = op->getResult(i); + res_values.push_back(AddValue(res)); + } + impl_->cur_op->SetResults(res_values); + VLOG(3) << "Emit call " << callee_name.getValue().str() << " " << impl_->cur_op->frame(); return true; diff --git a/paddle/infrt/host_context/op_executable.cc b/paddle/infrt/host_context/op_executable.cc index cf40d7315c6a5..59a73e7108328 100644 --- a/paddle/infrt/host_context/op_executable.cc +++ b/paddle/infrt/host_context/op_executable.cc @@ -133,7 +133,8 @@ void OpExecutable::Execute() { VLOG(3) << "execute " << name() << " --- frame args: " << impl_->frame.GetNumArgs() << " results " << impl_->frame.GetNumResults() << " attributes " - << impl_->frame.GetNumAttributes(); + << impl_->frame.GetNumAttributes() << "\n" + << frame().DumpArgTypes(); for (int i = 0; i < impl_->frame.GetNumArgs(); i++) { VLOG(3) << "function arg: " << impl_->frame.GetArgAt(i); } diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index 21c06c4bfd8f4..eb9a2092657aa 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -45,10 +45,13 @@ namespace infrt { namespace host_context { +struct None {}; + struct MlirFunctionExecutable; using ValueVariantType = - Variant const T& get() const { - CHECK(data.template is()); + CHECK(data.template is()) << "typeid: " << data.index() + << " != " << ValueVariantType::IndexOf; return data.get(); } template T& get() { - CHECK(data.template is()); + CHECK(data.template is()) << "typeid: " << data.index() + << " != " << ValueVariantType::IndexOf; return data.get(); } @@ -153,6 +158,8 @@ class Value : public common::Object { const char* type_info() const override; + ValueVariantType::IndexT index() const { return data.index(); } + friend void CopyTo(const Value& from, Value* to); private: diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index ff9ae50bc4345..5284f499916c3 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -18,7 +18,7 @@ namespace infrt { namespace kernel { namespace phi { -backends::CpuPhiContext CreateCpuContext() { return {}; } +::phi::CPUContext CreateCpuContext() { return {}; } } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/context_kernels.h b/paddle/infrt/kernel/phi/context_kernels.h index 6fe1a01f770db..8082dc6c2ff29 100644 --- a/paddle/infrt/kernel/phi/context_kernels.h +++ b/paddle/infrt/kernel/phi/context_kernels.h @@ -21,7 +21,7 @@ namespace infrt { namespace kernel { namespace phi { -backends::CpuPhiContext CreateCpuContext(); +::phi::CPUContext CreateCpuContext(); } // namespace phi } // namespace kernel diff --git a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc index 62b204b160448..165f7f7c94377 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.cc @@ -26,9 +26,6 @@ void InferShapedKernelLauncher::CreateKernelFrameForInferShape( if (value->is_type<::phi::DenseTensor>()) { values.emplace_back(::phi::MetaTensor{&value->get<::phi::DenseTensor>()}); 
infershape_kernel_frame_builder.AddArgument(values.back().get()); - } else if (value->is_type()) { - values.emplace_back(phi::MetaTensor{&value->get()}); - infershape_kernel_frame_builder.AddArgument(values.back().get()); } else { infershape_kernel_frame_builder.AddArgument(value); } diff --git a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h index 713f7df7f5225..a0a5b391ea669 100644 --- a/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h +++ b/paddle/infrt/kernel/phi/infershaped/phi_kernel_launcher.h @@ -14,7 +14,9 @@ #pragma once #include +#include +#include "paddle/infrt/backends/host/phi_context.h" #include "paddle/infrt/host_context/kernel_utils.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_kernel_launcher.h" #include "paddle/infrt/kernel/phi/infershaped/infershaped_utils.h" @@ -22,6 +24,26 @@ namespace infrt { namespace kernel { +static void FakePhiInferShape(const ::phi::MetaTensor& a, + const ::phi::MetaTensor& b, + bool arg_0, + bool arg_1, + ::phi::MetaTensor* c) { + LOG(INFO) << "the ptr of c: " << c; + LOG(INFO) << "c->numel(): " << c->numel(); +} + +static void FakePhiKernel(const ::phi::CPUContext& /*Context*/, + const ::phi::DenseTensor& a, + const ::phi::DenseTensor& b, + bool arg_0, + bool arg_1, + ::phi::DenseTensor* c) { + std::cout << "@FakePhiKernel@" << std::endl; + LOG(INFO) << "the ptr of c: " << c; + LOG(INFO) << "c->numel(): " << c->numel(); +} + template ::count}; static const bool turn_on_infer_shape_cache{true}; void Invoke(host_context::KernelFrame* frame) override { +#ifndef NDEBUG + LOG(INFO) << "Kernel.frame: " << frame->DumpArgTypes(); +#endif // Build the infershape KernelFrame if needed. // TODO(Superjomn) add unlikely here. 
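    // Descriptive note: the MetaTensor-based infershape frame is built lazily
    // on the first Invoke; with turn_on_infer_shape_cache enabled, shape
    // inference below is re-run only when IsShapeChanged() reports that an
    // input shape differs from the cached one.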
if (infershape_kernel_frame_builder.IsEmpty()) { CreateKernelFrameForInferShape(frame); +#ifndef NDEBUG + LOG(INFO) << "infershape.frame: " + << infershape_kernel_frame_builder.DumpArgTypes(); +#endif } if (turn_on_infer_shape_cache) { if (!turn_on_infer_shape_cache || IsShapeChanged(num_input_tensors)) { diff --git a/paddle/infrt/kernel/phi/registry.cc b/paddle/infrt/kernel/phi/registry.cc index f4f0e75a987a2..5d79814d4bec7 100644 --- a/paddle/infrt/kernel/phi/registry.cc +++ b/paddle/infrt/kernel/phi/registry.cc @@ -43,17 +43,15 @@ void RegisterPhiKernels(host_context::KernelRegistry* registry) { registry->AddKernel("phi_dt.fill_dense_tensor.f32", INFRT_KERNEL(infrt::kernel::phi::FillDenseTensorF32)); registry->AddKernel( - "phi.matmul.host.fp32", - std::bind(&kernel::KernelLauncherFunc< - decltype(&::phi::MatmulKernel), - &::phi::MatmulKernel, - decltype(&::phi::MatmulInferMeta), - &::phi::MatmulInferMeta>, - kernel::KernelLauncher< - decltype(&::phi::MatmulKernel), - &::phi::MatmulKernel, - decltype(&::phi::MatmulInferMeta), - &::phi::MatmulInferMeta>(), + "phi_dt.fake_phi_kernel", + std::bind(&KernelLauncherFunc, + KernelLauncher(), std::placeholders::_1)); } diff --git a/paddle/infrt/kernel/tensor_kernels.cc b/paddle/infrt/kernel/tensor_kernels.cc index 1e55bcd07ae80..9de1350e97d1a 100644 --- a/paddle/infrt/kernel/tensor_kernels.cc +++ b/paddle/infrt/kernel/tensor_kernels.cc @@ -45,7 +45,7 @@ void PrintTensor(const DenseHostTensor &tensor) { } template -void FillTensorWithConstant(DenseHostTensor *tensor, Attribute v) { +void FillTensorWithConstant(Attribute v, DenseHostTensor *tensor) { MutableDTArrayView(tensor).Fill(v.get()); } @@ -53,13 +53,11 @@ TensorMap LoadParams(const std::string &path) { return *(infrt::tensor::LoadParams(path)); } -void TensorMapGetTensor(TensorMap map, - DenseHostTensor *out, - Attribute name) { +DenseHostTensor TensorMapGetTensor(TensorMap map, Attribute name) { auto it = map.find(name.get()); CHECK(it != map.end()) << "No tensor called " << name.get() << " in the TensorMap"; - *out = *it->second; + return *it->second; } int32_t TensorMapGetSize(TensorMap map) { return map.size(); } diff --git a/paddle/infrt/support/variant.h b/paddle/infrt/support/variant.h index 2f415b21c8010..b8dcd21ae27fe 100644 --- a/paddle/infrt/support/variant.h +++ b/paddle/infrt/support/variant.h @@ -136,12 +136,12 @@ class Variant { return nullptr; } - IndexT index() { return index_; } + IndexT index() const { return index_; } - private: template static constexpr size_t IndexOf = TupleIndexOf::value; + private: static constexpr size_t kStorageSize = std::max({sizeof(Ts)...}); static constexpr size_t kAlignment = std::max({alignof(Ts)...}); diff --git a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir index 21ba15d5fce7d..f0b0b849b93cb 100644 --- a/paddle/infrt/tests/dialect/pten/dense_tensor.mlir +++ b/paddle/infrt/tests/dialect/pten/dense_tensor.mlir @@ -1,11 +1,13 @@ -// RUN: infrtopt %s | FileCheck %s +// RUN: infrtexec -i %s | FileCheck %s -// CHECK-LABEL: @basic_tensor -func @basic_tensor() { - %a = "phi_dt.create_allocator.cpu" (): () -> !phi.CPU_allocator - %b = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context - %c = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%a) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) - // "phi_dt.fill_dense_tensor.f32" (%c) {value=[1.0:f32]} : (!Infrt.tensor) -> () +// CHECK-LABEL: @fake_phi_kernel_execute +func @fake_phi_kernel_execute() { + %allocator = 
"phi_dt.create_allocator.cpu" (): () -> !phi.CPU_allocator + %ctx = "phi_dt.create_context.cpu" (): () -> !phi.CPU_context + %t = "phi_dt.create_dense_tensor.cpu.f32.nchw" (%allocator) {dims=[1:i64], lod=[1:i64]}: (!phi.CPU_allocator) -> (!infrt.dense_tensor) + // CHECK: @FakePhiKernel@ + %d = "phi_dt.fake_phi_kernel" (%ctx, %t, %t) {transpose_x=false, transpose_y=false} : (!phi.CPU_context, !infrt.dense_tensor, !infrt.dense_tensor) -> (!infrt.dense_tensor) Infrt.return } + From 0b2058172ffab252f011fe59cddc75ab0d92faf8 Mon Sep 17 00:00:00 2001 From: ShenLiang <1422485404@qq.com> Date: Wed, 23 Feb 2022 21:38:39 +0800 Subject: [PATCH 10/85] Add ProcessGroupNCCL for distributed training (#39737) * add processgroup_nccl --- paddle/fluid/distributed/CMakeLists.txt | 2 +- .../distributed/collective/CMakeLists.txt | 5 + .../fluid/distributed/collective/NCCLTools.h | 198 +++++++++++ .../distributed/collective/ProcessGroup.cc | 40 +++ .../distributed/collective/ProcessGroup.h | 108 ++++++ .../collective/ProcessGroupNCCL.cc | 321 ++++++++++++++++++ .../distributed/collective/ProcessGroupNCCL.h | 126 +++++++ paddle/fluid/distributed/collective/Types.h | 36 ++ paddle/fluid/platform/cuda_device_guard.h | 24 +- .../fluid/platform/device/gpu/nccl_helper.h | 17 + paddle/fluid/platform/flags.cc | 12 + paddle/fluid/pybind/CMakeLists.txt | 8 + paddle/fluid/pybind/distributed_py.cc | 149 ++++++++ paddle/fluid/pybind/distributed_py.h | 29 ++ paddle/fluid/pybind/pybind.cc | 4 + .../fluid/tests/unittests/CMakeLists.txt | 4 + .../tests/unittests/process_group_nccl.py | 149 ++++++++ .../test_collective_process_group.py | 27 ++ 18 files changed, 1253 insertions(+), 6 deletions(-) create mode 100644 paddle/fluid/distributed/collective/CMakeLists.txt create mode 100644 paddle/fluid/distributed/collective/NCCLTools.h create mode 100644 paddle/fluid/distributed/collective/ProcessGroup.cc create mode 100644 paddle/fluid/distributed/collective/ProcessGroup.h create mode 100644 paddle/fluid/distributed/collective/ProcessGroupNCCL.cc create mode 100644 paddle/fluid/distributed/collective/ProcessGroupNCCL.h create mode 100644 paddle/fluid/distributed/collective/Types.h create mode 100644 paddle/fluid/pybind/distributed_py.cc create mode 100644 paddle/fluid/pybind/distributed_py.h create mode 100644 python/paddle/fluid/tests/unittests/process_group_nccl.py create mode 100644 python/paddle/fluid/tests/unittests/test_collective_process_group.py diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 1527b752c6906..06b0583eddf24 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,5 +1,5 @@ +add_subdirectory(collective) add_subdirectory(store) - if(NOT WITH_PSCORE) add_subdirectory(fleet_executor) return() diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt new file mode 100644 index 0000000000000..5daaf29ae2895 --- /dev/null +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -0,0 +1,5 @@ +cc_library(processgroup SRCS ProcessGroup.cc DEPS pten pten_api eager_api) + +if(WITH_NCCL) + cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context pten pten_api eager_api) +endif() diff --git a/paddle/fluid/distributed/collective/NCCLTools.h b/paddle/fluid/distributed/collective/NCCLTools.h new file mode 100644 index 0000000000000..f30b96e72d453 --- /dev/null +++ 
b/paddle/fluid/distributed/collective/NCCLTools.h @@ -0,0 +1,198 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "boost/variant.hpp" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace distributed { + +#define NCCLCHECK(cmd) \ + do { \ + ncclResult_t r = cmd; \ + if (r != ncclSuccess) { \ + printf("Failed, NCCL error %s:%d '%s'\n", __FILE__, __LINE__, \ + platform::dynload::ncclGetErrorString(r)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +// NOTE(shenliang03): EventManager are movable not copyable CudaEvent wrapper. +// EventManage is different from paddle::platform::CudaEvent. +// It uses lazy initialization and is only created when the +// Record() method is called for the first time; it also monitors +// device information to ensure that recorded stream and event +// are on the same device. + +class EventManager { + public: + EventManager() {} + explicit EventManager(unsigned int flags) : flags_{flags} {} + + ~EventManager() { + if (is_created_) { + platform::CUDADeviceGuard guard(device_index_); + cudaEventDestroy(event_); + } + } + + EventManager(const EventManager&) = delete; + EventManager& operator=(const EventManager&) = delete; + + EventManager(EventManager&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + } + + EventManager& operator=(EventManager&& other) { + std::swap(flags_, other.flags_); + std::swap(is_created_, other.is_created_); + std::swap(device_index_, other.device_index_); + std::swap(event_, other.event_); + return *this; + } + + bool IsCreated() const { return is_created_; } + bool DeviceId() const { return device_index_; } + gpuEvent_t GetRawCudaEvent() const { return event_; } + + void Record(const paddle::platform::CUDADeviceContext& ctx) { + auto device_index = ctx.GetPlace().device; + if (!is_created_) { + CreateEvent(device_index); + } + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + + platform::CUDADeviceGuard guard(device_index_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventRecord(event_, ctx.stream())); + } + + bool Query() const { + gpuError_t err = cudaEventQuery(event_); + if (err == cudaSuccess) { + return true; + } else if (err == cudaErrorNotReady) { + return false; + } else { + PADDLE_ENFORCE_GPU_SUCCESS(err); + return false; + } + } + + void Synchronize() const { + if (is_created_) { + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventSynchronize(event_)); + } + } + + void Block(const 
paddle::platform::CUDADeviceContext& ctx) const { + if (is_created_) { + auto device_index = ctx.GetPlace().device; + PADDLE_ENFORCE_EQ(device_index, device_index_, + platform::errors::PreconditionNotMet( + "CUDADeviceContext's device %d does not match" + "Event's device %d", + device_index, device_index_)); + platform::CUDADeviceGuard guard(device_index_); + PADDLE_ENFORCE_GPU_SUCCESS(cudaStreamWaitEvent(ctx.stream(), event_, 0)); + } + } + + private: + unsigned int flags_ = cudaEventDefault; + bool is_created_{false}; + gpuEvent_t event_{}; + int8_t device_index_{0}; + + private: + void CreateEvent(int device_index) { + device_index_ = device_index; + platform::CUDADeviceGuard guard(device_index); + PADDLE_ENFORCE_GPU_SUCCESS(cudaEventCreateWithFlags(&event_, flags_)); + is_created_ = true; + } +}; + +// NOTE(shenliang03): NCCLCommManager is more lightweight than +// platform::NCCLComm + +class NCCLCommManager { + public: + explicit NCCLCommManager(ncclComm_t ncclComm) : nccl_comm_(ncclComm) {} + + NCCLCommManager() : NCCLCommManager(nullptr) {} + + ~NCCLCommManager() noexcept { + std::unique_lock lock(mutex_); + if (nccl_comm_) { + platform::dynload::ncclCommDestroy(nccl_comm_); + } + } + + static std::shared_ptr Create(int num_ranks, int rank, + ncclUniqueId comm_id) { + auto nccl_manager = std::make_shared(); + NCCLCHECK(platform::dynload::ncclCommInitRank(&(nccl_manager->nccl_comm_), + num_ranks, comm_id, rank)); + + nccl_manager->nccl_id_ = comm_id; + nccl_manager->rank_ = rank; + return nccl_manager; + } + + ncclUniqueId GetNcclId() const { + std::unique_lock lock(mutex_); + return nccl_id_; + } + + ncclComm_t GetNcclComm() const { + std::unique_lock lock(mutex_); + return nccl_comm_; + } + + NCCLCommManager(const NCCLCommManager&) = delete; + NCCLCommManager& operator=(const NCCLCommManager&) = delete; + NCCLCommManager& operator=(NCCLCommManager&& other) = delete; + + NCCLCommManager(NCCLCommManager&& other) { + std::unique_lock lock(other.mutex_); + std::swap(nccl_comm_, other.nccl_comm_); + } + + protected: + ncclComm_t nccl_comm_; + ncclUniqueId nccl_id_; + int rank_; + mutable std::mutex mutex_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.cc b/paddle/fluid/distributed/collective/ProcessGroup.cc new file mode 100644 index 0000000000000..42ca3bd5f5be4 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroup.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
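+//
+// This file carries only the backend-agnostic bookkeeping: Task records the
+// rank and collective type plus a completion flag, and concrete backends such
+// as ProcessGroupNCCL override IsCompleted/Wait/Synchronize with real
+// stream/event handling.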
+ +#include "paddle/fluid/distributed/collective/ProcessGroup.h" + +namespace paddle { +namespace distributed { + +ProcessGroup::Task::Task(int rank, const std::vector& inputTensors, + CommType comm_type) + : rank_(rank), comm_type_(comm_type) {} + +ProcessGroup::Task::~Task() = default; + +bool ProcessGroup::Task::IsCompleted() { + std::lock_guard lock(mutex_); + return is_completed_; +} + +bool ProcessGroup::Task::Wait(std::chrono::milliseconds timeout) { + return false; +} + +void ProcessGroup::Task::Synchronize() {} + +ProcessGroup::ProcessGroup(int rank, int size) : rank_(rank), size_(size) {} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h new file mode 100644 index 0000000000000..dde8622d9007e --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/enforce.h" + +constexpr auto kWaitTimeout = std::chrono::milliseconds(0); + +namespace paddle { +namespace distributed { + +using Tensor = paddle::experimental::Tensor; + +enum class CommType : std::uint8_t { + BROADCAST = 0, + ALLREDUCE = 1, + ALLREDUCE_SPARSE = 2, // TODO(shenliang03): to support sparse in allreduce + REDUCE = 3, + ALLGATHER = 4, + GATHER = 5, + SCATTER = 6, + REDUCE_SCATTER = 7, + ALLTOALL = 8, + SEND = 9, + RECV = 10, + BARRIER = 11, + UNKNOWN = 100, +}; + +struct ProcessGroupStrategy { + int nranks_{1}; + int local_rank_{0}; + std::vector trainer_endpoints_{}; + std::string current_endpoint_{""}; + int nrings_{1}; +}; + +class ProcessGroup { + public: + class Task { + public: + Task(int rank, const std::vector& inputTensors, + CommType opType = CommType::UNKNOWN); + + virtual ~Task(); + virtual bool IsCompleted(); + virtual bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + virtual void Synchronize(); + + protected: + const int rank_; + CommType comm_type_; + std::mutex mutex_; + bool is_completed_ = false; + }; + + explicit ProcessGroup(int rank, int size); + virtual ~ProcessGroup() {} + + int GetRank() const { return rank_; } + + int GetSize() const { return size_; } + + virtual const std::string GetBackendName() const = 0; + + virtual std::shared_ptr AllReduce( + std::vector& /* tensors */, + const AllreduceOptions& = AllreduceOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support allreduce", GetBackendName())); + } + + virtual std::shared_ptr Broadcast( + std::vector& /* tensors */, + const BroadcastOptions& = BroadcastOptions()) { + PADDLE_THROW(platform::errors::InvalidArgument( + 
"ProcessGroup%s does not support allreduce", GetBackendName())); + } + + protected: + const int rank_; + const int size_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc new file mode 100644 index 0000000000000..fe2325423b460 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -0,0 +1,321 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" + +DECLARE_bool(nccl_blocking_wait); +DECLARE_bool(use_stream_safe_cuda_allocator); + +constexpr int64_t kWaitBlockTImeout = 10; + +namespace paddle { +namespace distributed { + +static ncclRedOp_t ToNCCLRedType(ReduceOp reduction) { + static const std::map red_type = { + {ReduceOp::MIN, ncclMin}, + {ReduceOp::MAX, ncclMax}, + {ReduceOp::SUM, ncclSum}, + {ReduceOp::PRODUCT, ncclProd}, + }; + auto it = red_type.find(reduction); + PADDLE_ENFORCE_EQ(it != red_type.end(), true, + platform::errors::InvalidArgument( + "Invalid nccl reduction. Must be ncclMin | ncclMax | " + "ncclProd | ncclSum")); + return it->second; +} + +std::string SerializeNCCLUniqueId(const ncclUniqueId& ncclID) { + const uint8_t* bytes = reinterpret_cast(&ncclID); + std::ostringstream oss; + for (auto i = 0; i < NCCL_UNIQUE_ID_BYTES; ++i) { + oss << std::hex << static_cast(bytes[i]); + } + return oss.str(); +} + +// Get the list of devices from list of tensors +std::vector GetPlaceList(const std::vector& tensors) { + std::vector places; + places.reserve(tensors.size()); + for (auto& tensor : tensors) { + places.push_back(tensor.inner_place()); + } + return places; +} + +// Get the deviceList String from the list of devices +std::string GetKeyFromPlaces(const std::vector& places) { + std::string placeList; + for (auto& place : places) { + std::stringstream tmp; + tmp << place; + if (placeList.empty()) { + placeList += tmp.str(); + } else { + placeList += "," + tmp.str(); + } + } + return placeList; +} + +bool CheckTensorsInCudaPlace(const std::vector& tensors) { + return std::all_of(tensors.cbegin(), tensors.cend(), [&](const Tensor& t) { + return t.place() == PlaceType::kGPU; + }); +} + +void SyncDefaultStream( + const std::vector& places, + std::vector& ncclEvents, // NOLINT + std::vector>& dev_ctx) { // NOLINT + for (size_t i = 0; i < places.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places[i])); + ncclEvents[i].Record(*dev_ctx[i]); + ncclEvents[i].Block(*default_ctx); + } +} + +std::shared_ptr ProcessGroupNCCL::CreateTask( + std::vector places, int rank, CommType comm_type, + const std::vector& inputs) { + return std::make_shared(places, rank, comm_type, + inputs); +} + +ProcessGroupNCCL::NCCLTask::NCCLTask(const std::vector& places, int rank, + CommType CommType, + const 
std::vector& inputs) + : Task(rank, inputs, CommType), places_(places) { + control_events_.resize(places.size()); + ncclComms_.resize(places.size()); +} + +ProcessGroupNCCL::NCCLTask::~NCCLTask() {} + +void ProcessGroupNCCL::NCCLTask::SetOutputs( + std::vector& outputs) { // NOLINT + outputs_ = std::make_shared>(outputs); +} + +void ProcessGroupNCCL::NCCLTask::SynchronizeStreams() { + for (size_t i = 0; i < places_.size(); ++i) { + auto* default_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(places_[i])); + default_ctx->WaitEvent(control_events_[i].GetRawCudaEvent()); + } +} + +bool ProcessGroupNCCL::NCCLTask::IsCompleted() { + for (size_t i = 0; i < places_.size(); ++i) { + if (!control_events_[i].Query()) { + return false; + } + } + + return true; +} + +// TODO(sheniang03): Add timeout for wait, now timeout unused +bool ProcessGroupNCCL::NCCLTask::Wait(std::chrono::milliseconds timeout) { + SynchronizeStreams(); + if (FLAGS_nccl_blocking_wait) { + // NOTE(shenliang03): It will block host for sync + while (!IsCompleted()) { + std::this_thread::sleep_for(std::chrono::milliseconds(kWaitBlockTImeout)); + } + } + return true; +} + +// Same as Wait +void ProcessGroupNCCL::NCCLTask::Synchronize() { Wait(kWaitTimeout); } + +ProcessGroupNCCL::ProcessGroupNCCL(const ProcessGroupStrategy& strategy, + int rank, int size) + : ProcessGroup(rank, size), strategy_(strategy) {} + +void ProcessGroupNCCL::BcastNCCLId( + std::vector& nccl_ids, // NOLINT + int root, int server_fd) { + if (strategy_.local_rank_ == root) { + std::vector other_trainers; + for (auto& ep : strategy_.trainer_endpoints_) { + if (ep != strategy_.current_endpoint_) { + other_trainers.push_back(ep); + } + } + platform::SendBroadCastCommID(other_trainers, &nccl_ids); + } else { + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &nccl_ids); + } +} + +void ProcessGroupNCCL::BroadcastUniqueNCCLID( + std::vector& nccl_ids) { // NOLINT + + int server_fd = -1; + if (rank_ != 0) { + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); + } + BcastNCCLId(nccl_ids, 0, server_fd); +} + +// create NCCLManager cache for places_key +void ProcessGroupNCCL::CreateNCCLManagerCache( + const std::string& places_key, const std::vector& places) { + PADDLE_ENFORCE_EQ(places_key.empty(), false, + platform::errors::PreconditionNotMet( + "Not able to create/get the NCCL Communicator since " + "the GPU place are not known")); + + std::vector> nccl_comms; + nccl_comms.resize(places.size()); + + // using vector just for broadcast + std::vector nccl_ids; + nccl_ids.resize(1); + auto& nccl_id = nccl_ids.front(); + + if (rank_ == 0) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGetUniqueId(&nccl_id)); + } + BroadcastUniqueNCCLID(nccl_ids); + + VLOG(3) << "init nccl rank: " << strategy_.local_rank_ + << ", nranks: " << strategy_.nranks_ << ", place: " << places_key + << ", nccl uniqueid: " << SerializeNCCLUniqueId(nccl_id); + + std::vector> dev_ctx; + dev_ctx.resize(places.size()); + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + + for (size_t i = 0; i < places.size(); ++i) { + platform::CUDADeviceGuard guard(places[i]); + nccl_comms[i] = NCCLCommManager::Create(GetSize(), GetRank(), nccl_id); + dev_ctx[i].reset(new CUDADeviceContext(places[i])); + } + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + + std::vector events; + events.resize(places.size()); + + // These caches will be useful to process sync/wait/communicate + 
places_to_events_.emplace(places_key, std::move(events)); + places_to_ncclcomm_.emplace(places_key, std::move(nccl_comms)); + places_to_ctx_.emplace(places_key, std::move(dev_ctx)); +} + +template +std::shared_ptr ProcessGroupNCCL::Collective( + std::vector& inputs, std::vector& outputs, Fn fn, + CommType op_type) { + const auto places = GetPlaceList(inputs); + const auto key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + + auto task = CreateTask(places, rank_, op_type, inputs); + task->SetOutputs(outputs); + + // construct uninitialize guard for device + platform::CUDADeviceGuard cuda_guard; + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + auto dense_tensor = + std::dynamic_pointer_cast(inputs[i].impl()); + memory::RecordStream(dense_tensor->Holder(), + places_to_ctx_[key][i]->stream()); + } + } + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(inputs[i], outputs[i], nccl_comms[i]->GetNcclComm(), nccl_stream); + } + } + + for (size_t i = 0; i < inputs.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + return task; +} + +std::shared_ptr ProcessGroupNCCL::AllReduce( + std::vector& tensors, const AllreduceOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + tensors, tensors, + [&](const Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclAllReduce( + input_tensor->data(), output_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), + ToNCCLRedType(opts.reduce_op), comm, stream); + }, + CommType::ALLREDUCE); +} + +std::shared_ptr ProcessGroupNCCL::Broadcast( + std::vector& tensors, const BroadcastOptions& opts) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(tensors), true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + + return Collective( + tensors, tensors, + [&](Tensor& input, Tensor& output, ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = opts.source_rank * tensors.size() + opts.source_root; + auto input_tensor = + std::dynamic_pointer_cast(input.impl()); + auto output_tensor = + std::dynamic_pointer_cast(output.impl()); + return platform::dynload::ncclBcast( + input_tensor->data(), input_tensor->numel(), + platform::ToNCCLDataType(input.type()), root, comm, stream); + }, + CommType::BROADCAST); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h new file mode 100644 index 0000000000000..9f06566d1c863 --- /dev/null +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -0,0 +1,126 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/cuda_device_guard.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/distributed/collective/NCCLTools.h" +#include "paddle/fluid/platform/dynload/nccl.h" +#endif + +constexpr const char* NCCL_BACKEND_NAME = "NCCL"; + +namespace paddle { +namespace distributed { + +using Place = paddle::platform::Place; +using CUDAStream = platform::stream::CUDAStream; +using CUDADeviceContext = paddle::platform::CUDADeviceContext; + +class ProcessGroupNCCL : public ProcessGroup { + public: + class NCCLTask : public ProcessGroup::Task, + public std::enable_shared_from_this { + public: + NCCLTask(const std::vector& places, int rank, CommType CommType, + const std::vector& inputs); + + bool IsCompleted(); + + void SynchronizeStreams(); + + bool Wait(std::chrono::milliseconds timeout = kWaitTimeout); + + void Synchronize(); + + void SetOutputs(std::vector& outputs); // NOLINT + + virtual ~NCCLTask(); + + std::vector control_events_; + + protected: + std::vector places_; + std::vector> ncclComms_; + std::shared_ptr> outputs_; + + private: + }; + + ProcessGroupNCCL(const ProcessGroupStrategy& strategy, int rank, int size); + + const std::string GetBackendName() const override { + return std::string(NCCL_BACKEND_NAME); + } + + std::shared_ptr AllReduce( + std::vector& tensors, + const AllreduceOptions& = AllreduceOptions()) override; + + std::shared_ptr Broadcast( + std::vector& tensors, + const BroadcastOptions& = BroadcastOptions()) override; + + protected: + virtual std::shared_ptr CreateTask( + std::vector places, int rank, CommType opType, + const std::vector& inputs); + + protected: + ProcessGroupStrategy strategy_; + std::shared_ptr nccl_comm_; + std::mutex mutex_; + std::unordered_map>> + places_to_ncclcomm_; + + std::unordered_map> places_to_events_; + + std::unordered_map>> + places_to_ctx_; + + private: + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT + int server_fd); + + void BroadcastUniqueNCCLID(std::vector& nccl_ids); // NOLINT + + template + std::shared_ptr Collective( + std::vector& inputs, // NOLINT + std::vector& outputs, // NOLINT + Fn fn, CommType op_type); + + void CreateNCCLManagerCache(const std::string& places_key, + const std::vector& places); +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/Types.h b/paddle/fluid/distributed/collective/Types.h new file mode 100644 index 0000000000000..654d06686957b --- /dev/null +++ b/paddle/fluid/distributed/collective/Types.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +namespace paddle { +namespace distributed { + +// TODO(shenliang03): To support AVG for reduce +enum class ReduceOp : std::uint8_t { SUM = 0, AVG, MAX, MIN, PRODUCT }; + +struct AllreduceOptions { + ReduceOp reduce_op = ReduceOp::SUM; +}; + +struct BroadcastOptions { + int source_rank = 0; + int source_root = 0; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_device_guard.h b/paddle/fluid/platform/cuda_device_guard.h index 40204c0ed83f9..08beed532a7ec 100644 --- a/paddle/fluid/platform/cuda_device_guard.h +++ b/paddle/fluid/platform/cuda_device_guard.h @@ -14,13 +14,28 @@ #pragma once #include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace platform { class CUDADeviceGuard { public: - explicit inline CUDADeviceGuard(int dev_id) { + explicit CUDADeviceGuard(int dev_id) { SetDeviceIndex(dev_id); } + + explicit CUDADeviceGuard(const CUDAPlace& place) + : CUDADeviceGuard(place.device) {} + + // create uninitialized CUDADeviceGuard + CUDADeviceGuard() {} + + ~CUDADeviceGuard() { + if (prev_id_ != -1) { + platform::SetDeviceId(prev_id_); + } + } + + inline void SetDeviceIndex(const int dev_id) { int prev_id = platform::GetCurrentDeviceId(); if (prev_id != dev_id) { prev_id_ = prev_id; @@ -28,10 +43,9 @@ class CUDADeviceGuard { } } - inline ~CUDADeviceGuard() { - if (prev_id_ != -1) { - platform::SetDeviceId(prev_id_); - } + void SetDevice(const CUDAPlace& place) { + int dev_id = place.device; + SetDeviceIndex(dev_id); } CUDADeviceGuard(const CUDADeviceGuard& o) = delete; diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 1d6ccdc1280a9..1919f59f8c07f 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -56,6 +56,23 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { } } +inline ncclDataType_t ToNCCLDataType(experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return ncclFloat; + } else if (type == experimental::DataType::FLOAT64) { + return ncclDouble; + } else if (type == experimental::DataType::INT32) { + return ncclInt; + } else if (type == experimental::DataType::INT64) { + return ncclInt64; + } else if (type == experimental::DataType::FLOAT16) { + return ncclFloat16; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in nccl is not supported.")); + } +} + // NOTE(minqiyang): according to the ncclGroupEnd documentations: // https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, // ncclGroupEnd will wait for all communicators to be initialized, which will diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 39f95a9295661..baf043e860be4 100644 --- a/paddle/fluid/platform/flags.cc +++ 
b/paddle/fluid/platform/flags.cc @@ -761,3 +761,15 @@ DEFINE_bool(enable_slotrecord_reset_shrink, false, "enable slotrecord obejct reset shrink memory, default false"); DEFINE_bool(enable_ins_parser_file, false, "enable parser ins file , default false"); + +/** + * ProcessGroupNCCL related FLAG + * Name: nccl_blocking_wait + * Since Version: + * Value Range: bool, default=false + * Example: + * Note: nccl blocking wait. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +PADDLE_DEFINE_EXPORTED_bool(nccl_blocking_wait, false, "nccl blocking wait"); +#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 01b21d02ea017..e76183192bcee 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -80,6 +80,14 @@ set(PYBIND_SRCS communication.cc cuda_streams_py.cc) +if(NOT ON_INFER) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup) + if (WITH_NCCL) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) + endif() + set(PYBIND_SRCS ${PYBIND_SRCS} distributed_py.cc) +endif() + if(WITH_ASCEND) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc new file mode 100644 index 0000000000000..e057fb53ccecc --- /dev/null +++ b/paddle/fluid/pybind/distributed_py.cc @@ -0,0 +1,149 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#ifdef _POSIX_C_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#ifdef _XOPEN_SOURCE +#undef _XOPEN_SOURCE +#endif + +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/pybind/distributed_py.h" +#include "paddle/fluid/pybind/eager_utils.h" +#include "paddle/phi/api/all.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" +#endif + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +using Tensor = paddle::experimental::Tensor; + +void BindDistributed(py::module *m) { + py::enum_(*m, "ReduceOp") + .value("SUM", distributed::ReduceOp::SUM) + .value("AVG", distributed::ReduceOp::AVG) + .value("MAX", distributed::ReduceOp::MAX) + .value("MIN", distributed::ReduceOp::MIN) + .value("PRODUCT", distributed::ReduceOp::PRODUCT); + + py::class_(*m, "AllreduceOptions") + .def(py::init<>()) + .def_readwrite("reduce_op", &distributed::AllreduceOptions::reduce_op); + + py::class_(*m, "BroadcastOptions") + .def(py::init<>()) + .def_readwrite("source_rank", &distributed::BroadcastOptions::source_rank) + .def_readwrite("source_root", + &distributed::BroadcastOptions::source_root); + + auto ProcessGroup = + py::class_>(*m, "ProcessGroup") + .def("rank", &distributed::ProcessGroup::GetRank) + .def("size", &distributed::ProcessGroup::GetSize) + .def("name", &distributed::ProcessGroup::GetBackendName) + .def("allreduce", + [](distributed::ProcessGroup &self, py::handle py_tensor, + distributed::ReduceOp op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::AllreduceOptions opts; + opts.reduce_op = op; + std::vector tensors = {tensor}; + return self.AllReduce(tensors, opts); + }, + py::arg("tensor"), py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()) + + .def("broadcast", + [](distributed::ProcessGroup &self, py::handle py_tensor, + int source_rank) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts; + opts.source_rank = source_rank; + std::vector tensors = {tensor}; + return self.Broadcast(tensors, opts); + }, + py::arg("tensor"), py::arg("source_rank"), + py::call_guard()); + +#if defined(PADDLE_WITH_NCCL) + py::class_>( + *m, "ProcessGroupNCCL", ProcessGroup) + .def(py::init(), + py::call_guard()); + + py::class_>(*m, "task") + .def("is_completed", &distributed::ProcessGroup::Task::IsCompleted) + .def("wait", &distributed::ProcessGroup::Task::Wait, + py::arg("timeout") = kWaitTimeout, + py::call_guard()) + .def("synchronize", &distributed::ProcessGroup::Task::Synchronize, + py::call_guard()); +#endif + + // define parallel strategy, it will be removed + py::class_ pg_strategy( + *m, "ProcessGroupStrategy", ""); + pg_strategy.def(py::init()) + .def_property("nranks", + [](const distributed::ProcessGroupStrategy &self) { + return self.nranks_; + }, + [](distributed::ProcessGroupStrategy &self, int nranks) { + self.nranks_ = nranks; + }) + .def_property("local_rank", + [](const distributed::ProcessGroupStrategy &self) { + return self.local_rank_; + }, + [](distributed::ProcessGroupStrategy &self, + int local_rank) { self.local_rank_ = local_rank; }) + .def_property( + "trainer_endpoints", + [](const distributed::ProcessGroupStrategy &self) { + return self.trainer_endpoints_; + }, + [](distributed::ProcessGroupStrategy &self, + std::vector 
eps) { self.trainer_endpoints_ = eps; }) + .def_property("current_endpoint", + [](const distributed::ProcessGroupStrategy &self) { + return self.current_endpoint_; + }, + [](distributed::ProcessGroupStrategy &self, + const std::string &ep) { self.current_endpoint_ = ep; }) + .def_property("nrings", + [](const distributed::ProcessGroupStrategy &self) { + return self.nrings_; + }, + [](distributed::ProcessGroupStrategy &self, int nrings) { + self.nrings_ = nrings; + }); +} + +} // end namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/distributed_py.h b/paddle/fluid/pybind/distributed_py.h new file mode 100644 index 0000000000000..be5c7549b8e8d --- /dev/null +++ b/paddle/fluid/pybind/distributed_py.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pybind11/chrono.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace py = pybind11; + +namespace paddle { +namespace pybind { + +void BindDistributed(py::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 58205041b8041..958174420570e 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -78,6 +78,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/pybind/cuda_streams_py.h" +#include "paddle/fluid/pybind/distributed_py.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/lod_utils.h" #ifndef PADDLE_ON_INFERENCE @@ -3895,6 +3896,9 @@ All parameter, weight, gradient are variables in Paddle. BindCompatible(&m); BindDataset(&m); BindGenerator(&m); +#ifndef PADDLE_ON_INFERENCE + BindDistributed(&m); +#endif #ifdef PADDLE_WITH_ASCEND BindAscendWrapper(&m); BindAscendGraph(&m); diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 15ddcf588441e..ad0a81e725707 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -54,6 +54,7 @@ list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) list(APPEND DIST_TEST_OPS test_auto_parallel_data_unshard) list(APPEND DIST_TEST_OPS test_auto_parallel_save_load) list(APPEND DIST_TEST_OPS test_auto_parallel_autoconvert) +list(APPEND DIST_TEST_OPS test_collective_process_group) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. 
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -290,6 +291,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_data_unshard) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_save_load) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_autoconvert) + LIST(REMOVE_ITEM TEST_OPS test_collective_process_group) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -1114,6 +1116,8 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_auto_parallel_data_unshard PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_save_load PROPERTIES TIMEOUT 120) set_tests_properties(test_auto_parallel_autoconvert PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_process_group PROPERTIES TIMEOUT 120) + if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/process_group_nccl.py b/python/paddle/fluid/tests/unittests/process_group_nccl.py new file mode 100644 index 0000000000000..d999aad63ecf4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/process_group_nccl.py @@ -0,0 +1,149 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
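+#
+# This test is launched on two GPUs by test_collective_process_group.py and
+# exercises the eager-mode ProcessGroupNCCL bindings: allreduce with SUM and
+# MAX, and broadcast from rank 0, for both float32 and float16 tensors.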
+ +from __future__ import print_function + +import unittest +import random +import numpy as np +import os +import shutil + +import paddle +from paddle.fluid import core +from datetime import timedelta +import paddle.fluid.core as core +from paddle.fluid.framework import _test_eager_guard +from paddle.fluid.dygraph.parallel import ParallelEnv + +ProcessGroupStrategy = core.ProcessGroupStrategy + + +def init_process_group(strategy=None): + # this will remove + if strategy is None: + strategy = ProcessGroupStrategy() + strategy.nranks = ParallelEnv().nranks + strategy.local_rank = ParallelEnv().local_rank + strategy.trainer_endpoints = ParallelEnv().trainer_endpoints + strategy.current_endpoint = ParallelEnv().current_endpoint + if strategy.nranks < 2: + return + + pg_group = core.ProcessGroupNCCL(strategy, strategy.local_rank, + strategy.nranks) + + return pg_group + + +class TestProcessGroupFp32(unittest.TestCase): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float32" + self.shape = (2, 10, 5) + + def test_create_process_group_nccl(self): + with _test_eager_guard(): + paddle.set_device('gpu:%d' % + paddle.distributed.ParallelEnv().dev_id) + + pg = init_process_group() + print("rank:", pg.rank(), "size:", pg.size(), "name:", pg.name()) + print("test new group api ok") + + # test allreduce sum + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + sum_result = tensor_x + tensor_y + if pg.rank() == 0: + task = pg.allreduce(tensor_x) + task.wait() + assert np.array_equal(tensor_x, sum_result) + else: + task = pg.allreduce(tensor_y) + task.wait() + assert np.array_equal(tensor_y, sum_result) + + print("test allreduce sum api ok") + + # test allreduce max + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + max_result = paddle.maximum(tensor_x, tensor_y) + + if pg.rank() == 0: + task = pg.allreduce(tensor_x, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_x, max_result) + else: + task = pg.allreduce(tensor_y, core.ReduceOp.MAX) + task.wait() + assert np.array_equal(tensor_y, max_result) + + print("test allreduce max api ok") + + # test broadcast + # rank 0 + x = np.random.random(self.shape).astype(self.dtype) + tensor_x = paddle.to_tensor(x) + # rank 1 + y = np.random.random(self.shape).astype(self.dtype) + tensor_y = paddle.to_tensor(y) + + broadcast_result = paddle.assign(tensor_x) + if pg.rank() == 0: + task = pg.broadcast(tensor_x, 0) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_x) + else: + task = pg.broadcast(tensor_y, 0) + task.synchronize() + paddle.device.cuda.synchronize() + assert task.is_completed() + assert np.array_equal(broadcast_result, tensor_y) + + print("test broadcast api ok") + + +class TestProcessGroupFp16(TestProcessGroupFp32): + def setUp(self): + paddle.seed(2022) + random.seed(2022) + np.random.seed(2022) + self.config() + + def config(self): + self.dtype = "float16" + self.shape = (4, 20, 20) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_process_group.py b/python/paddle/fluid/tests/unittests/test_collective_process_group.py 
new file mode 100644 index 0000000000000..6ae5424a882da --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_collective_process_group.py @@ -0,0 +1,27 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestProcessGroup(TestMultipleGpus): + def test_process_group_nccl(self): + self.run_mnist_2gpu('process_group_nccl.py') + + +if __name__ == "__main__": + unittest.main() From 2457a7d1b6d2e54de2e2f2ca8c997b38b2957027 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Wed, 23 Feb 2022 15:14:27 +0100 Subject: [PATCH 11/85] added paddle_bfloat to requirements (#39740) --- .../test_python_bf16_numpy_datatype.py | 34 +++++++++++++++++++ python/requirements.txt | 1 + 2 files changed, 35 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py diff --git a/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py b/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py new file mode 100644 index 0000000000000..a58d7d35807c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_python_bf16_numpy_datatype.py @@ -0,0 +1,34 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
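+#
+# Relies on the paddle_bfloat package (added to python/requirements.txt in the
+# same patch) and checks that a numpy matmul computed in bfloat16 stays close
+# to the same matmul computed in float32.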
+ +import numpy as np +from paddle_bfloat import bfloat16 +import unittest + + +class TestBF16DataType(unittest.TestCase): + def test_matmul(self): + a_bf16 = np.random.random((6, 7)).astype(bfloat16) + b_bf16 = np.random.random((7, 8)).astype(bfloat16) + c_bf16 = np.matmul(a_bf16, b_bf16) + + a_fp32 = a_bf16.astype(np.float32) + b_fp32 = b_bf16.astype(np.float32) + c_fp32 = np.matmul(a_fp32, b_fp32) + + self.assertTrue(np.allclose(c_bf16, c_fp32)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/requirements.txt b/python/requirements.txt index f2a4580a94e51..5f2b788a81a0a 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -5,3 +5,4 @@ Pillow six decorator astor +paddle_bfloat==0.1.2 From 76a6b88d233aae2c7f6804f3c6aeb19e903dd54a Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 24 Feb 2022 09:52:33 +0800 Subject: [PATCH 12/85] [PHi] Skip kernel declare for cuda only kernel on rocm (#39869) * skip kernel declare for cuda only kernel on rocm * fix error --- cmake/pten.cmake | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/cmake/pten.cmake b/cmake/pten.cmake index 9a3552efce8e1..5645ac6cfa303 100644 --- a/cmake/pten.cmake +++ b/cmake/pten.cmake @@ -58,15 +58,21 @@ endfunction() function(kernel_declare TARGET_LIST) foreach(kernel_path ${TARGET_LIST}) file(READ ${kernel_path} kernel_impl) - # TODO(chenweihang): rename PD_REGISTER_KERNEL to PD_REGISTER_KERNEL - # NOTE(chenweihang): now we don't recommend to use digit in kernel name - string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*," first_registry "${kernel_impl}") + string(REGEX MATCH "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*" first_registry "${kernel_impl}") if (NOT first_registry STREQUAL "") + # some gpu kernel only can run on cuda, not support rocm, so we add this branch + if (WITH_ROCM) + string(FIND "${first_registry}" "cuda_only" pos) + if(pos GREATER 1) + continue() + endif() + endif() # parse the first kernel name string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name "${kernel_name}") string(REPLACE "," "" kernel_name "${kernel_name}") string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") + string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}") # append kernel declare into declarations.h # TODO(chenweihang): default declare ALL_LAYOUT for each kernel if (${kernel_path} MATCHES "./cpu\/") From d6038c22696e23dfc181643694e84f888e8001ae Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Thu, 24 Feb 2022 10:21:33 +0800 Subject: [PATCH 13/85] optimize performance of lookup_table_v2_op (#39856) * optimize block config and fp16 atomicAdd perf for lookup_table_v2_grad. --- paddle/fluid/operators/lookup_table_v2_op.cu | 45 ++++++---- .../platform/device/gpu/gpu_primitives.h | 88 +++++++++++++++++++ 2 files changed, 115 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 4539f7091b578..d40b264378570 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -21,19 +21,18 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template __global__ void LookupTableV2(T *output, const T *table, const IdT *ids, const int64_t N, const int64_t K, const int64_t D, const int64_t padding_idx) { int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; + int idy = blockIdx.x + threadIdx.y * gridDim.x; while (idy < K) { auto id = static_cast(ids[idy]); T *out = output + idy * D; const T *tab = table + id * D; - for (int i = idx; i < D; i += BlockDimX) { + for (int i = idx; i < D; i += blockDim.x) { if (PaddingFlag) { if (id == padding_idx) out[i] = static_cast(0); @@ -43,25 +42,29 @@ __global__ void LookupTableV2(T *output, const T *table, const IdT *ids, out[i] = tab[i]; } } - idy += BlockDimY * GridDimX; + idy += blockDim.y * gridDim.x; } } -template +template __global__ void LookupTableV2Grad(T *table, const T *output, const IdT *ids, const int64_t N, const int64_t K, const int64_t D) { int idx = threadIdx.x; - int idy = blockIdx.x + threadIdx.y * GridDimX; + int idy = blockIdx.x + threadIdx.y * gridDim.x; while (idy < K) { auto id = static_cast(ids[idy]); const T *out = output + idy * D; T *tab = table + id * D; - for (int i = idx; i < D; i += BlockDimX) { +#ifdef PADDLE_WITH_CUDA + paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); +#else + for (int i = idx; i < D; i += blockDim.x) { paddle::platform::CudaAtomicAdd(&tab[i], out[i]); } - idy += BlockDimY * GridDimX; +#endif + idy += blockDim.y * gridDim.x; } } @@ -81,8 +84,9 @@ struct LookupTableV2CUDAFunctor { size_t D = table_t->dims()[1]; size_t K = ids_t_->numel(); + const int gridx = 2 * context_.cuda_device_context().GetSMCount(); dim3 threads(256, 4); - dim3 grids(80, 1); + dim3 grids(gridx, 1); const auto *table = table_t->template data(); const auto *ids = ids_t_->template data(); @@ -90,10 +94,10 @@ struct LookupTableV2CUDAFunctor { auto stream = context_.cuda_device_context().stream(); if (padding_idx == -1) { - LookupTableV2<<>>( + LookupTableV2<<>>( output, table, ids, N, K, D, padding_idx); } else { - LookupTableV2<<>>( + LookupTableV2<<>>( output, table, ids, N, K, D, padding_idx); } } @@ -193,17 +197,22 @@ struct LookupTableV2GradCUDAFunctor { int D = d_table_t->dims()[1]; int K = ids_t_->numel(); - dim3 threads(128, 8); - dim3 grids(8, 1); const T *d_output = d_output_t->template data(); const auto *ids = ids_t_->template data(); T *d_table = d_table_t->mutable_data(context_.GetPlace()); - auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS( + hipMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + cudaMemsetAsync(d_table, 0, N * D * sizeof(T), dev_ctx.stream())); +#endif - LookupTableV2Grad<<>>( + const int gridx = 2 * dev_ctx.GetSMCount(); + dim3 threads(128, 8); + dim3 grids(gridx, 1); + LookupTableV2Grad<<>>( d_table, d_output, ids, N, K, D); } } diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 3e070da546b2a..8616e969f69df 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -147,6 +147,94 @@ CUDA_ATOMIC_WRAPPER(Add, float16) { } } #endif + +// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" +// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. 
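+// fastAtomicAdd therefore packs the fp16 value into one lane of a __half2
+// (with the unused lane set to zero) whenever an aligned __half2 access is
+// possible, and falls back to a plain fp16 atomic add at the ends of the
+// array or on pre-CUDA-10 / pre-sm_70 builds.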
+template ::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *tensor, size_t index, + const size_t numel, T value) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + CudaAtomicAdd(reinterpret_cast(tensor) + index, + static_cast(value)); +#else + // whether the address is 32-byte aligned. + __half *target_addr = reinterpret_cast<__half *>(tensor + index); + bool aligned_half2 = + (reinterpret_cast(target_addr) % sizeof(__half2) == 0); + + if (aligned_half2 && index < (numel - 1)) { + __half2 value2; + value2.x = *reinterpret_cast<__half *>(&value); + value2.y = __int2half_rz(0); + atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); + + } else if (!aligned_half2 && index > 0) { + __half2 value2; + value2.x = __int2half_rz(0); + value2.y = *reinterpret_cast<__half *>(&value); + atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); + + } else { + atomicAdd(reinterpret_cast<__half *>(tensor) + index, + *reinterpret_cast<__half *>(&value)); + } +#endif +} + +template ::value>::type * = nullptr> +__device__ __forceinline__ void fastAtomicAdd(T *arr, size_t index, + const size_t numel, T value) { + CudaAtomicAdd(arr + index, value); +} + +#ifdef PADDLE_WITH_CUDA +/* + * One thead block deals with elementwise atomicAdd for vector of len. + * @in: [x1, x2, x3, ...] + * @out:[y1+x1, y2+x2, y3+x3, ...] + * */ +template ::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +} + +// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. +template ::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { + int i = 0; + int loops = len / 2 * 2; + + bool aligned_half2 = + (reinterpret_cast(out) % sizeof(__half2) == 0); + + if (aligned_half2) { + for (i = tid * 2; i < loops; i += threads_per_block * 2) { + __half2 value2; + T value_1 = in[i]; + T value_2 = in[i + 1]; + value2.x = *reinterpret_cast<__half *>(&value_1); + value2.y = *reinterpret_cast<__half *>(&value_2); + atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); + } + for (; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } else { + for (int i = tid; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } +} +#endif #endif CUDA_ATOMIC_WRAPPER(Add, complex) { From 2136bd42910d759f54dec111779dd3f1d2218db6 Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Thu, 24 Feb 2022 10:23:26 +0800 Subject: [PATCH 14/85] Fix a bug in IndexKernel out-of-memory (#39867) --- paddle/fluid/operators/index_impl.cu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index bae0d3f569f5f..3d6a5e0ea88a2 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -45,7 +45,7 @@ __global__ void VectorizedIndexKernel(T *out, int numel, int main_offset, BLOCK_NUM_X * VecSize); } int num = numel - data_offset; - if (numel > 0) { + if (num > 0) { kps::InitWithDataIndex(&args[0], data_offset); kps::ElementwiseUnary(&result[0], &args[0], func); From 6b5749eb80b094e95bdca983f5786e4807472e48 Mon Sep 17 00:00:00 2001 From: 
wanghuancoder Date: Thu, 24 Feb 2022 10:37:40 +0800 Subject: [PATCH 15/85] [Eager] save load testcase (#39571) * eager, test=develop * fix bug, test=develop * eager, test=develop * merge legacy to fluid * eager, test=develop * eager, test=develop * Refactor TensorAdd func by template and remove gradient_accumulation in eager * Remove needless target name * eager, test=develop * eager, test=develop * Use overload instead of template * Remove legacy code * Remove legacy code * selectedrows, test=develop * Remove DataType test * eager, test=develop * eager, test=develop * support gan, test=develop * Using Tensor directly instead of using EagerTensor * support gradient_accumulation * make test_imperative_lod_tensor_to_selected_rows longer * make test_imperative_lod_tensor_to_selected_rows longer * refine code * ptb, test=develop * Rename all EagerTensor to Tensor * Rename some EagerTensor to Tensor * rename EagerTensor to EagerVariable * eager, test=develop * eager, test=develop * eager, test=develop * eager, test=develop * add more test * eager, test=develop * Support copiable selected rows and merge develop * save load, eager, test=develop * save load, eager, test=develop * refine, test=develop * refine, test=develop * refine, test=develop * revert static_runner, test=develop * EagerTensor to Tensor, test=develop * refine, test=develop * refine, test=develop * clear grad, test=develop * merge, develop * merge, develop * merge, test=develop * merge, test=develop Co-authored-by: JiabinYang <360788950@qq.com> Co-authored-by: Weilong Wu --- paddle/fluid/pybind/eager.cc | 15 +- python/paddle/fluid/dygraph/checkpoint.py | 6 +- python/paddle/fluid/dygraph/io.py | 130 +++++++++++++----- .../unittests/test_imperative_save_load.py | 57 +++++--- .../unittests/test_imperative_save_load_v2.py | 60 +++++--- python/paddle/framework/io.py | 23 +++- 6 files changed, 199 insertions(+), 92 deletions(-) diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 3867336764834..2296169a16104 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -64,12 +64,6 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, framework::proto::VarType::Type var_type = paddle::framework::proto::VarType::LOD_TENSOR) { auto ddims = phi::make_ddim(dims); - PADDLE_ENFORCE_GE( - phi::product(ddims), 0, - paddle::platform::errors::InvalidArgument( - "Create Eager Tensor with dims contain minus num is ilegal" - "Please check your code and make sure you new a " - "eager tensor with fixed shape instead of using -1.")); self->tensor.set_name(name); auto autograd_meta = egr::EagerUtils::autograd_meta(&(self->tensor)); autograd_meta->SetPersistable(persistable); @@ -83,13 +77,10 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, phi::make_intrusive(place), phi::DenseTensorMeta(paddle::framework::TransToPtenDataType(dtype), ddims)); - dense_tensor->mutable_data(place); + if (phi::product(ddims) > 0) { + dense_tensor->mutable_data(place); + } self->tensor.set_impl(dense_tensor); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "We only support LoDTensor to be constructed by this initializer, " - "please check your var type first and make sure you are going to " - "construct LoDTensor.")); } if (!autograd_meta->GetMutableGradNode()) { diff --git a/python/paddle/fluid/dygraph/checkpoint.py b/python/paddle/fluid/dygraph/checkpoint.py index a98dc5a79aec3..3776599daab16 100644 --- a/python/paddle/fluid/dygraph/checkpoint.py +++ 
b/python/paddle/fluid/dygraph/checkpoint.py @@ -17,7 +17,7 @@ import os import collections import functools -from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer +from ..framework import Variable, default_main_program, in_dygraph_mode, dygraph_only, Parameter, ParamBase, _varbase_creator, _dygraph_tracer, EagerParamBase import pickle from . import learning_rate_scheduler import warnings @@ -94,7 +94,7 @@ def save_dygraph(state_dict, model_path): param_num = 0 for k, v in state_dict.items(): - if isinstance(v, ParamBase): + if isinstance(v, (ParamBase, EagerParamBase)): param_num += 1 if param_num == 0: @@ -103,7 +103,7 @@ def save_dygraph(state_dict, model_path): model_dict = {} name_table = {} for k, v in state_dict.items(): - if isinstance(v, (Variable, core.VarBase)): + if isinstance(v, (Variable, core.VarBase, core.eager.Tensor)): model_dict[k] = v.numpy() name_table[k] = v.name else: diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index 9ffdea969be5d..aad7737350961 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -535,12 +535,20 @@ def _load_persistable_vars_by_program(model_path, orig_each_name = program_holder._suffix_varname_dict[each_var.name()] if _is_parameter(each_var, program_holder.infer_program): # create output varbase - new_var = framework.ParamBase( - shape=each_var.shape(), - dtype=each_var.dtype(), - name=each_var.name(), - type=each_var.type(), - persistable=True) + if framework._in_eager_mode(): + new_var = framework.EagerParamBase( + shape=each_var.shape(), + dtype=each_var.dtype(), + name=each_var.name(), + type=each_var.type(), + persistable=True) + else: + new_var = framework.ParamBase( + shape=each_var.shape(), + dtype=each_var.dtype(), + name=each_var.name(), + type=each_var.type(), + persistable=True) else: new_var = framework._varbase_creator( type=each_var.type(), @@ -620,11 +628,22 @@ def _load_persistable_vars(model_path, var_info_path, program_holder, # create output varbase if extra_var_info[name].get('trainable', None) is not None: # use default shape and dtype - new_var = framework.ParamBase( - shape=[1], # only to pass check, this shape is not meaningful - dtype=core.VarDesc.VarType.FP32, - name=new_name, - persistable=True) + if framework._in_eager_mode(): + new_var = framework.EagerParamBase( + shape=[ + 1 + ], # only to pass check, this shape is not meaningful + dtype=core.VarDesc.VarType.FP32, + name=new_name, + persistable=True) + else: + new_var = framework.ParamBase( + shape=[ + 1 + ], # only to pass check, this shape is not meaningful + dtype=core.VarDesc.VarType.FP32, + name=new_name, + persistable=True) else: new_var = framework._varbase_creator( name=new_name, persistable=True) @@ -747,18 +766,26 @@ def _run_dygraph(instance, input, program_holder): # 1. prepare inputs, outputs, attrs input_vars = [] for i, value in enumerate(input): - if not isinstance(value, (np.ndarray, core.VarBase)): + if not isinstance(value, (np.ndarray, core.VarBase, core.eager.Tensor)): raise TypeError( "The type of input in TranslatedLayer must be numpy array or Variable(VarBase), but received %s." 
% type(value)) # NOTE: In order to unify the API, firstly convert the input to VarBase if isinstance(value, np.ndarray): - var = core.VarBase( - value=value, - name=program_holder.input_descs[i].name(), - persistable=False, - place=framework._current_expected_place(), - zero_copy=True) + if framework._in_eager_mode(): + var = core.eager.Tensor( + value=value, + name=program_holder.input_descs[i].name(), + persistable=False, + place=framework._current_expected_place(), + zero_copy=True) + else: + var = core.VarBase( + value=value, + name=program_holder.input_descs[i].name(), + persistable=False, + place=framework._current_expected_place(), + zero_copy=True) else: var = value # NOTE: we changed var name here, @@ -784,30 +811,62 @@ def _run_dygraph(instance, input, program_holder): output_vars = [] for var_desc in program_holder.output_descs: - var = core.VarBase(var_desc.dtype(), - var_desc.shape(), - var_desc.name(), var_desc.type(), False) + if framework._in_eager_mode(): + var = core.eager.Tensor( + dtype=var_desc.dtype(), + dims=var_desc.shape(), + name=var_desc.name(), + type=var_desc.type(), + persistable=False) + else: + var = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), var_desc.type(), False) output_vars.append(var) # hold forward variables - tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], - "program_out_scope", - core.VarDesc.VarType.STEP_SCOPES, True) + if framework._in_eager_mode(): + tmp_scope_vec = core.eager.Tensor( + dtype=core.VarDesc.VarType.FP32, + dims=[], + name="program_out_scope", + type=core.VarDesc.VarType.STEP_SCOPES, + persistable=True) + else: + tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], + "program_out_scope", + core.VarDesc.VarType.STEP_SCOPES, True) tmp_scope_vec.value().set_scope(program_holder.scope) double_grad_vars = [] for var_desc in program_holder.double_grad_descs: - var = core.VarBase(var_desc.dtype(), - var_desc.shape(), - var_desc.name(), var_desc.type(), False) + if framework._in_eager_mode(): + var = core.eager.Tensor( + dtype=var_desc.dtype(), + dims=var_desc.shape(), + name=var_desc.name(), + type=var_desc.type(), + persistable=False) + else: + var = core.VarBase(var_desc.dtype(), + var_desc.shape(), + var_desc.name(), var_desc.type(), False) double_grad_vars.append(var) if len(double_grad_vars) == 0: - double_grad_vars = [ - core.VarBase( - value=[1], - name='Fake_var', - place=framework._current_expected_place()) - ] + if framework._in_eager_mode(): + double_grad_vars = [ + core.eager.Tensor( + value=[1], + name='Fake_var', + place=framework._current_expected_place()) + ] + else: + double_grad_vars = [ + core.VarBase( + value=[1], + name='Fake_var', + place=framework._current_expected_place()) + ] # 2. 
run program by op trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program @@ -1215,11 +1274,12 @@ def __init__(self, programs, persistable_vars): # the TranslatedLayer object holded var names count started from 0 with unique_name.guard(): for name, var in persistable_vars.items(): - if isinstance(var, framework.ParamBase): + if isinstance(var, + (framework.ParamBase, framework.EagerParamBase)): dy_name = _generate_unique_var_name(PARAMETER_NAME_PREFIX) self._persistable_var_name_dict[name] = dy_name self.add_parameter(dy_name, var) - elif isinstance(var, core.VarBase): + elif isinstance(var, (core.VarBase, core.eager.Tensor)): dy_name = _generate_unique_var_name(BUFFER_NAME_PREFIX) self._persistable_var_name_dict[name] = dy_name self.register_buffer(dy_name, var) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 6c6b164bdec68..160c94a549c91 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -27,6 +27,7 @@ import numpy as np import six import paddle +from paddle.fluid.framework import _test_eager_guard class SimpleLSTMRNN(fluid.Layer): @@ -208,7 +209,7 @@ def forward(self, input, label, init_hidden, init_cell): class TestDygraphPtbRnn(unittest.TestCase): - def setUp(self): + def func_setUp(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -277,7 +278,7 @@ def setUp(self): self.opti_dict = adam.state_dict() self.base_opti = {} for k, v in self.opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.base_opti[v.name] = v.numpy() self.assertTrue(np.sum(np.abs(v.numpy())) != 0) else: @@ -294,7 +295,7 @@ def setUp(self): fluid.save_dygraph(self.state_dict, "./test_dy") - def testLoadAndSetVarBase(self): + def func_testLoadAndSetVarBase(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -363,7 +364,7 @@ def testLoadAndSetVarBase(self): opti_dict = adam.state_dict() # set to zero for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_t = v.numpy() var = v.value().get_tensor() var.set(np.zeros_like(np_t), place) @@ -374,11 +375,12 @@ def testLoadAndSetVarBase(self): adam._learning_rate.step_num = 0 para_state_dict, opti_state_dict = fluid.load_dygraph("./test_dy") + print(opti_state_dict.keys()) adam.set_state_dict(opti_state_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name])) else: @@ -403,7 +405,7 @@ def testLoadAndSetVarBase(self): self.assertTrue(np.array_equal(new_t, base_t)) - def testSetVariable(self): + def func_testSetVariable(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -472,7 +474,7 @@ def testSetVariable(self): opti_dict = adam.state_dict() # set to zero for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_t = v.numpy() var = v.value().get_tensor() var.set(np.zeros_like(np_t), place) @@ -485,7 +487,7 @@ def testSetVariable(self): adam.set_state_dict(self.opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.assertTrue( np.array_equal(v.numpy(), 
self.base_opti[v.name])) else: @@ -510,7 +512,7 @@ def testSetVariable(self): self.assertTrue(np.array_equal(new_t, base_t)) - def testSetNumpy(self): + def func_testSetNumpy(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -580,7 +582,7 @@ def testSetNumpy(self): np_opti_dict = {} # set to zero for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_t = v.numpy() np_opti_dict[v.name] = np_t var = v.value().get_tensor() @@ -596,7 +598,7 @@ def testSetNumpy(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name])) else: @@ -623,7 +625,7 @@ def testSetNumpy(self): self.assertTrue(np.array_equal(new_t, base_t)) - def testSetVariableBeforeTrain(self): + def func_testSetVariableBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -700,7 +702,7 @@ def testSetVariableBeforeTrain(self): base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t)) - def testLoadAndSetVarBaseBeforeTrain(self): + def func_testLoadAndSetVarBaseBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -791,7 +793,7 @@ def testLoadAndSetVarBaseBeforeTrain(self): base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t)) - def testSetNumpyBeforeTrain(self): + def func_testSetNumpyBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -840,7 +842,7 @@ def testSetNumpyBeforeTrain(self): np_state_dict = {} for k, v in self.opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_opti_dict[v.name] = v.numpy() else: np_opti_dict[k] = v @@ -894,7 +896,7 @@ def testSetNumpyBeforeTrain(self): base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t)) - def testOnlyLoadParams(self): + def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() @@ -911,7 +913,7 @@ def testOnlyLoadParams(self): para_state_dict, opti_state_dict = fluid.load_dygraph( os.path.join('saved_dy', 'emb_dy.pdopt')) - def test_load_compatible_with_keep_name_table(self): + def func_test_load_compatible_with_keep_name_table(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() @@ -922,6 +924,27 @@ def test_load_compatible_with_keep_name_table(self): self.assertTrue(para_state_dict != None) self.assertTrue(opti_state_dict == None) + def test_main(self): + self.func_setUp() + self.func_testLoadAndSetVarBase() + self.func_testSetVariable() + self.func_testSetNumpy() + self.func_testSetVariableBeforeTrain() + self.func_testLoadAndSetVarBaseBeforeTrain() + self.func_testSetNumpyBeforeTrain() + self.func_testOnlyLoadParams() + self.func_test_load_compatible_with_keep_name_table() + with _test_eager_guard(): + self.func_setUp() + self.func_testLoadAndSetVarBase() + self.func_testSetVariable() + self.func_testSetNumpy() + self.func_testSetVariableBeforeTrain() + self.func_testLoadAndSetVarBaseBeforeTrain() + self.func_testSetNumpyBeforeTrain() + self.func_testOnlyLoadParams() + self.func_test_load_compatible_with_keep_name_table() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 9f0dcdb4d8f0c..7e7b2e2fd5206 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -27,6 +27,7 @@ import numpy as np import six import paddle +from paddle.fluid.framework import _test_eager_guard class SimpleLSTMRNN(fluid.Layer): @@ -208,7 +209,7 @@ def forward(self, input, label, init_hidden, init_cell): class TestDygraphPtbRnn(unittest.TestCase): - def setUp(self): + def func_setUp(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -279,7 +280,7 @@ def setUp(self): self.opti_dict = adam.state_dict() self.base_opti = {} for k, v in self.opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.base_opti[v.name] = v.numpy() self.assertTrue(np.sum(np.abs(v.numpy())) != 0) else: @@ -296,7 +297,7 @@ def setUp(self): paddle.save(self.state_dict, "./test_dy_v2.pdparams") - def testLoadAndSetVarBase(self): + def func_testLoadAndSetVarBase(self): self.setUp() seed = 90 hidden_size = 10 @@ -367,7 +368,7 @@ def testLoadAndSetVarBase(self): opti_dict = adam.state_dict() # set to zero for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_t = v.numpy() var = v.value().get_tensor() var.set(np.zeros_like(np_t), place) @@ -380,7 +381,7 @@ def testLoadAndSetVarBase(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name])) else: @@ -405,7 +406,7 @@ def testLoadAndSetVarBase(self): self.assertTrue(np.array_equal(new_t, base_t)) - def testSetVariable(self): + def func_testSetVariable(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -475,7 +476,7 @@ def testSetVariable(self): opti_dict = adam.state_dict() # set to zero for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_t = v.numpy() var = v.value().get_tensor() var.set(np.zeros_like(np_t), place) @@ -488,7 +489,7 @@ def testSetVariable(self): adam.set_state_dict(self.opti_dict) opti_dict = adam.state_dict() for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name])) else: @@ -513,7 +514,7 @@ def testSetVariable(self): self.assertTrue(np.array_equal(new_t, base_t)) - def testSetNumpy(self): + def func_testSetNumpy(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -584,7 +585,7 @@ def testSetNumpy(self): np_opti_dict = {} # set to zero for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_t = v.numpy() np_opti_dict[v.name] = np_t var = v.value().get_tensor() @@ -600,7 +601,7 @@ def testSetNumpy(self): opti_dict = adam.state_dict() for k, v in opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): self.assertTrue( np.array_equal(v.numpy(), self.base_opti[v.name])) else: @@ -627,7 +628,7 @@ def testSetNumpy(self): self.assertTrue(np.array_equal(new_t, base_t)) - def testSetVariableBeforeTrain(self): + def func_testSetVariableBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -706,7 +707,7 @@ def testSetVariableBeforeTrain(self): base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t)) - def testLoadAndSetVarBaseBeforeTrain(self): + def 
func_testLoadAndSetVarBaseBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -797,7 +798,7 @@ def testLoadAndSetVarBaseBeforeTrain(self): base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t)) - def testSetNumpyBeforeTrain(self): + def func_testSetNumpyBeforeTrain(self): seed = 90 hidden_size = 10 vocab_size = 1000 @@ -846,7 +847,7 @@ def testSetNumpyBeforeTrain(self): np_state_dict = {} for k, v in self.opti_dict.items(): - if isinstance(v, core.VarBase): + if isinstance(v, (core.VarBase, core.eager.Tensor)): np_opti_dict[v.name] = v.numpy() else: np_opti_dict[k] = v @@ -902,7 +903,7 @@ def testSetNumpyBeforeTrain(self): base_t = self.model_base[k] self.assertTrue(np.array_equal(new_t, base_t)) - def testOnlyLoadParams(self): + def func_testOnlyLoadParams(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() @@ -911,7 +912,7 @@ def testOnlyLoadParams(self): para_state_dict = paddle.load( os.path.join('saved_dy', 'emb_dy.pdparams')) - def test_no_state_in_input_dict(self): + def func_test_no_state_in_input_dict(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() @@ -923,7 +924,7 @@ def test_no_state_in_input_dict(self): emb.set_state_dict(para_state_dict) - def test_state_shape_mismatch(self): + def func_test_state_shape_mismatch(self): with fluid.dygraph.guard(): emb = fluid.dygraph.Embedding([10, 10]) state_dict = emb.state_dict() @@ -936,6 +937,29 @@ def test_state_shape_mismatch(self): emb.set_state_dict(para_state_dict) + def test_main(self): + self.func_setUp() + self.func_testLoadAndSetVarBase() + self.func_testSetVariable() + self.func_testSetNumpy() + self.func_testSetVariableBeforeTrain() + self.func_testLoadAndSetVarBaseBeforeTrain() + self.func_testSetNumpyBeforeTrain() + self.func_testOnlyLoadParams() + self.func_test_no_state_in_input_dict() + self.func_test_state_shape_mismatch() + with _test_eager_guard(): + self.func_setUp() + self.func_testLoadAndSetVarBase() + self.func_testSetVariable() + self.func_testSetNumpy() + self.func_testSetVariableBeforeTrain() + self.func_testLoadAndSetVarBaseBeforeTrain() + self.func_testSetNumpyBeforeTrain() + self.func_testOnlyLoadParams() + self.func_test_no_state_in_input_dict() + self.func_test_state_shape_mismatch() + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 8367205a7e7c2..94b8bd29b2c19 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -30,7 +30,7 @@ from paddle.fluid.io import _legacy_save as _legacy_static_save from paddle.fluid.io import _open_file_buffer, _is_file_path, _is_memory_buffer -from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, in_dygraph_mode, ParamBase, _current_expected_place, Program +from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, in_dygraph_mode, ParamBase, EagerParamBase, _current_expected_place, Program from paddle.fluid.dygraph.jit import _SaveLoadConfig from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX @@ -42,7 +42,7 @@ def _build_saved_state_dict(state_dict): save_dict = {} name_table = {} for key, value in state_dict.items(): - if isinstance(value, (Variable, core.VarBase)): + if isinstance(value, (Variable, core.VarBase, core.eager.Tensor)): if value.type 
== core.VarDesc.VarType.VOCAB: save_dict[key] = value.value().get_map_tensor() else: @@ -260,6 +260,8 @@ def add_dispatch_table(): # This is not a good method, because the pickle module has been modified. pickle.dispatch_table[core.VarBase] = reduce_varbase pickle.dispatch_table[ParamBase] = reduce_varbase + pickle.dispatch_table[core.eager.Tensor] = reduce_varbase + pickle.dispatch_table[EagerParamBase] = reduce_varbase pickle.dispatch_table[core.LoDTensor] = reduce_LoDTensor pickle.dispatch_table.update(dispatch_table_layer) @@ -267,6 +269,8 @@ def pop_dispatch_table(): pickle.dispatch_table.pop(core.VarBase) pickle.dispatch_table.pop(core.LoDTensor) pickle.dispatch_table.pop(ParamBase) + pickle.dispatch_table.pop(core.eager.Tensor) + pickle.dispatch_table.pop(EagerParamBase) for k in dispatch_table_layer: pickle.dispatch_table.pop(k) @@ -286,6 +290,8 @@ def pop_dispatch_table(): pickler.dispatch_table[core.VarBase] = reduce_varbase pickler.dispatch_table[core.LoDTensor] = reduce_LoDTensor pickler.dispatch_table[ParamBase] = reduce_varbase + pickler.dispatch_table[core.eager.Tensor] = reduce_varbase + pickler.dispatch_table[EagerParamBase] = reduce_varbase pickler.dispatch_table.update(dispatch_table_layer) pickler.dump(obj) @@ -317,7 +323,8 @@ def _is_state_dict(obj): def condition(obj): return isinstance(obj, (fluid.Layer, Program, core.VarBase, - core.LoDTensor, core.SelectedRows)) + core.eager.Tensor, core.LoDTensor, + core.SelectedRows)) # If the value of a dict is a core.VarBase/LoDTensor or a dict # that does not contain a paddle type(Layer, Program, VarBase, LoDTensor, SelectedRows), @@ -327,7 +334,8 @@ def condition(obj): for k, v in value.items(): if _contain_x(v, condition): return False - elif not isinstance(value, (core.VarBase, core.LoDTensor)): + elif not isinstance(value, (core.VarBase, core.eager.Tensor, + core.LoDTensor)): return False return True @@ -412,8 +420,9 @@ def _parse_every_object(obj, condition_func, convert_func): elif type(obj) == set: return set(_parse_every_object(list(obj), condition_func, convert_func)) else: - if isinstance(obj, collections.Iterable) and not isinstance(obj, ( - str, np.ndarray, core.VarBase, core.LoDTensor)): + if isinstance(obj, collections.Iterable) and not isinstance( + obj, + (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor)): raise NotImplementedError( "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}.". 
format(type(obj))) @@ -541,7 +550,7 @@ def _save_binary_var(obj, path): _save_lod_tensor(obj, path) elif isinstance(obj, core.SelectedRows): _save_selected_rows(obj, path) - elif isinstance(obj, core.VarBase): + elif isinstance(obj, (core.VarBase, core.eager.Tensor)): _save_lod_tensor(obj.value().get_tensor(), path) else: # Since the concept of 'Tensor' is only exposed to users, the error message can only contain tensor instead of 'LoDTensor' or 'SelectedRows' From 5fd7b5c3092bef2e48817da8849c267835a43890 Mon Sep 17 00:00:00 2001 From: JZ-LIANG Date: Thu, 24 Feb 2022 10:42:13 +0800 Subject: [PATCH 16/85] fix bug for block state (#39854) --- python/paddle/distributed/auto_parallel/engine.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 98b76056a15a4..8efb9eb719237 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -139,6 +139,9 @@ def _plan(self): self._completer = Completer(self._dist_contexts[self.mode]) self._completer.complete_forward_annotation(serial_main_prog) # TODO: add auto planner process + # parse forward sub block + self._dist_contexts[self.mode].block_state.parse_forward_blocks( + serial_main_prog) def _parallel(self, rank): serial_main_program = self._serial_main_progs[self.mode] @@ -177,6 +180,8 @@ def _generate_backward(self, main_program, startup_program, loss): loss, distop_context=self._dist_contexts[self.mode].dist_op_context) self._completer.complete_backward_annotation(main_program) + self._dist_contexts[self.mode].block_state.parse_backward_blocks( + main_program) return params_grads def _generate_optimizer(self, main_program, startup_program, params_grads): From 867224b26254d13046f7287993eebad995ee3735 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Thu, 24 Feb 2022 10:52:17 +0800 Subject: [PATCH 17/85] Add Note for Place of Executor in Parallel Environment (#39063) Add note for Place of Executor in parallel environment --- python/paddle/fluid/executor.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 5ae1403f632b6..447d6457e0a3c 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -622,7 +622,9 @@ class Executor(object): is CPU version, the default device would be set to `CPUPlace()` . If Paddle is GPU version, the default device would be set to `CUDAPlace(0)` . Default is None. If ``place`` is string, it can be ``cpu``, and ``gpu:x``, where ``x`` - is the index of the GPUs. + is the index of the GPUs. Note: users only pass one Place or None to initialize + Executor when using multiple-cards. Other APIs will override the cards. 
See + `document for multiple-cards `_ Returns: Executor From c969955663c39de0c399f1b08c3cadbbb8076680 Mon Sep 17 00:00:00 2001 From: huangxu96 <46740794+huangxu96@users.noreply.github.com> Date: Thu, 24 Feb 2022 11:07:36 +0800 Subject: [PATCH 18/85] Optimize where_op and abs_grad_op by the elementwise interface (#39609) * Optimize the where_op by the elementwise_op funtion * Modified where_op & abs_grad_op by elementwise interface --- paddle/fluid/operators/where_op.cu | 19 ++++++-- paddle/phi/kernels/funcs/complex_functors.h | 47 +++++++++++++++++++ .../phi/kernels/impl/abs_grad_kernel_impl.h | 22 +++++++++ 3 files changed, 84 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu index 54b0d5b69086c..61a1691e4fe26 100644 --- a/paddle/fluid/operators/where_op.cu +++ b/paddle/fluid/operators/where_op.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/where_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" @@ -20,6 +21,15 @@ namespace platform = paddle::platform; namespace paddle { namespace operators { +template +struct CondFunctor { + HOSTDEVICE inline CondFunctor() {} + + HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const { + return cond ? x : y; + } +}; + template __global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x, const T* y, T* out) { @@ -63,10 +73,11 @@ class WhereKernel auto stream = context.cuda_device_context().stream(); auto& dev_ctx = context.template device_context(); - auto config = GetGpuLaunchConfig1D(dev_ctx, numel); - WhereCUDAKernel< - T><<>>( - numel, cond_data, x_data, y_data, out_data); + auto functor = CondFunctor(); + std::vector ins = {condition, X, Y}; + std::vector outs = {out}; + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } }; diff --git a/paddle/phi/kernels/funcs/complex_functors.h b/paddle/phi/kernels/funcs/complex_functors.h index 450adfcc68b7e..86dbdd099ecde 100644 --- a/paddle/phi/kernels/funcs/complex_functors.h +++ b/paddle/phi/kernels/funcs/complex_functors.h @@ -154,6 +154,53 @@ struct AbsFunctor>> { int64_t numel_; }; +template +struct AbsGradCUDAFunctor { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + + HOSTDEVICE inline T operator()(const T x, const T dout) const { + T output; + if (x == T(0)) { + output = T(0); + } else { + output = T(dout) * (x / T(std::abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const float dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + +template <> +struct AbsGradCUDAFunctor> { + HOSTDEVICE inline AbsGradCUDAFunctor() {} + HOSTDEVICE inline phi::dtype::complex operator()( + const phi::dtype::complex x, const double dout) const { + phi::dtype::complex output; + if (x == phi::dtype::complex(0)) { + output = phi::dtype::complex(0); + } else { + output = phi::dtype::complex(dout) * + (x / phi::dtype::complex(abs(x))); + } + return output; + } +}; + template struct AbsGradFunctor { AbsGradFunctor(const Real* dout, const T* x, T* output, 
int64_t numel) diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 939bc49c9fc67..4b31393a71f36 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -17,9 +17,30 @@ #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/abs_grad_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" namespace phi { +#if defined(__NVCC__) +template +void AbsGradKernelImpl(const GPUContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { + std::vector ins = {&x, &dout}; + std::vector outs = {dx}; + dev_ctx.Alloc(dx); + phi::funcs::AbsGradCUDAFunctor abs_grad_cuda_functor; + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, abs_grad_cuda_functor); +} +template +void AbsGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& dout, + DenseTensor* dx) { + AbsGradKernelImpl(dev_ctx, x, dout, dx); +} +#else template void AbsGradKernel(const Context& ctx, const DenseTensor& x, @@ -37,6 +58,7 @@ void AbsGradKernel(const Context& ctx, for_range(functor); } +#endif template void AbsDoubleGradKernel(const Context& ctx, const DenseTensor& x, From 75f91ce41b0b09e01446695a5d8909d710213dc8 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 24 Feb 2022 04:12:58 +0100 Subject: [PATCH 19/85] Fix for split op in BF16 inference (#39548) * Fix for split bf16 inference * added test for pass * changes after review --- .../framework/ir/graph_pattern_detector.cc | 9 + .../framework/ir/graph_pattern_detector.h | 9 + .../framework/ir/mkldnn/cpu_bfloat16_pass.cc | 166 +++++++++++++----- .../ir/mkldnn/cpu_bfloat16_pass_tester.cc | 29 ++- paddle/phi/kernels/cpu/split_kernel.cc | 3 +- 5 files changed, 168 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 63559e201594a..e4c9dc72128f4 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2516,6 +2516,15 @@ PDNode *patterns::DuplicatedInputs::operator()() { return op; } +PDNode *patterns::DuplicatedOutputs::operator()() { + auto op = pattern->NewNode(op_repr())->assert_is_ops({"split"}); + op->assert_more([&](Node *node) { + return node->Op()->GetAttrIfExists("mkldnn_data_type") == + "bfloat16"; + }); + return op; +} + PDNode *patterns::MKLDNNInPlace::operator()() { const std::unordered_set &supported_op_types = { "abs", "gelu", "leaky_relu", "relu", "softmax", "sqrt", "swish", "tanh"}; diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 79f1d63a15190..d6400ed6945bf 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1495,6 +1495,15 @@ struct DuplicatedInputs : public PatternBase { PATTERN_DECL_NODE(op); }; +struct DuplicatedOutputs : public PatternBase { + DuplicatedOutputs(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "many_outputs_op") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(op); +}; + // Pattern used for enforcing inplace computation for in-place computation // supporting DNNL ops. 
softmax, batch_norm and layer_norm struct MKLDNNInPlace : public PatternBase { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc index 5f9aefc1e7a0b..f1bd34a5ad4f6 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass.cc @@ -52,7 +52,7 @@ bool IsPermittedOutputName(const std::string& output_name) { } void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, - int* quantize_counter) { + int& quantize_counter) { std::vector input_names; // Find the name of the input linking op to op_in @@ -87,10 +87,10 @@ void AddQuantize(Graph* g, ir::Node* op, ir::Node* op_in, IR_NODE_LINK_TO(op_in, quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_node); IR_NODE_LINK_TO(quantize_out_node, op); - (*quantize_counter)++; + quantize_counter++; } -void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { +void AddQuantizes(Graph* g, ir::Node* op, int& quantize_counter) { auto inputs = op->inputs; PADDLE_ENFORCE_GE(inputs.size(), 1, platform::errors::InvalidArgument( @@ -127,7 +127,7 @@ void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { IR_NODE_LINK_TO(inputs[i], quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_nodes[i]); IR_NODE_LINK_TO(quantize_out_nodes[i], op); - (*quantize_counter)++; + quantize_counter++; } op->Op()->SetInput("X", quantize_out_node_names); @@ -136,7 +136,7 @@ void AddQuantizes(Graph* g, ir::Node* op, int* quantize_counter) { // Operators like Concat and Sum have a single input name X, which actually // consists of multiple inputs. Such operators require a different way to find // pattern and add quantize ops. -void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int* quantize_counter) { +void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int& quantize_counter) { GraphPatternDetector gpd; patterns::DuplicatedInputs duplicated_inputs{gpd.mutable_pattern(), "duplicated_inputs"}; @@ -151,7 +151,7 @@ void AddReoderBeforeDuplicatedInputs(ir::Graph* graph, int* quantize_counter) { // Adding quantize ops before all operators except Concat and Sum, which have // already been handled in AddReoderBeforeDuplicatedInputs -void AddReoderBeforeSingleInputs(ir::Graph* graph, int* quantize_counter) { +void AddReoderBeforeSingleInputs(ir::Graph* graph, int& quantize_counter) { GraphPatternDetector gpd; patterns::FirstBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), "first_bfloat16_ops"}; @@ -169,60 +169,134 @@ void AddReoderBeforeSingleInputs(ir::Graph* graph, int* quantize_counter) { void CPUBFloat16Pass::SetInputDataType(ir::Graph* graph) const { int quantize_counter = 0; - AddReoderBeforeDuplicatedInputs(graph, &quantize_counter); - AddReoderBeforeSingleInputs(graph, &quantize_counter); + AddReoderBeforeDuplicatedInputs(graph, quantize_counter); + AddReoderBeforeSingleInputs(graph, quantize_counter); PrettyLogDetail("--- added %d quantize ops before bfloat16 op", quantize_counter); } -void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { +void AddDequantize(Graph* g, ir::Node* op, ir::Node* op_out, + int& dequantize_counter) { + if (op->Op()->Type() == "prior_box") return; + + // Find the name of the output linking op to op_out + std::vector output_names; + for (auto name : op->Op()->OutputNames()) + for (auto output_name : op->Op()->Output(name)) + if (output_name == op_out->Name() && IsPermittedOutputName(name)) + output_names.push_back(name); + + if (output_names.empty()) return; + + VarDesc 
dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + deq_desc.SetInput("Input", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetOutput("Output", std::vector({op_out->Name()})); + deq_desc.SetAttr("Scale", 1.0f); + deq_desc.SetAttr("Shift", 0.0f); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. + + for (auto name = output_names.begin(); name < output_names.end(); name++) + op->Op()->SetOutput(*name, + std::vector({dequantize_in_node->Name()})); + + UnlinkNodes(op, op_out); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, op_out); + + dequantize_counter++; +} + +void AddDequantizes(Graph* g, ir::Node* op, int& dequantize_counter) { + auto outputs = op->outputs; + PADDLE_ENFORCE_GE(outputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s outputs(%d) must be equal or greater than 1.", + op->Name(), outputs.size())); + PADDLE_ENFORCE_EQ(op->inputs.size(), 1, + platform::errors::InvalidArgument( + "OP(%s)'s inputs(%d) must be equal to 1.", op->Name(), + op->inputs.size())); + + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + + std::vector dequantize_in_nodes(outputs.size()); + std::vector dequantize_in_node_names(outputs.size()); + + for (size_t i = 0; i < outputs.size(); i++) { + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + dequantize_in_nodes[i] = g->CreateVarNode(&dequantize_in_desc); + dequantize_in_node_names[i] = dequantize_in_nodes[i]->Name(); + + deq_desc.SetInput("Input", + std::vector({dequantize_in_node_names[i]})); + deq_desc.SetOutput("Output", + std::vector({outputs[i]->Name()})); + + deq_desc.SetAttr("Scale", 1.f); + deq_desc.SetAttr("Shift", 0.0f); + deq_desc.SetAttr("bfloat16", true); + deq_desc.SetAttr("output_format", op->Op()->HasAttr("data_layout") + ? op->Op()->GetAttr("data_layout") + : std::string("NCHW")); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. + + UnlinkNodes(op, outputs[i]); + IR_NODE_LINK_TO(op, dequantize_in_nodes[i]); + IR_NODE_LINK_TO(dequantize_in_nodes[i], dequantize_op); + IR_NODE_LINK_TO(dequantize_op, outputs[i]); + + dequantize_counter++; + } + + op->Op()->SetOutput("Out", dequantize_in_node_names); +} + +// Operators like split have a single output name Out, which actually +// consists of multiple outputs. Such operators require a different way to find +// pattern and add dequantize ops. 
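+// The handler below reuses AddDequantizes, which creates one dequantize op
+// per individual output tensor and rewires the op's "Out" list to the new
+// dequantize inputs, so every result leaves the bfloat16 region through its
+// own dequantize op before reaching non-bfloat16 consumers.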
+void AddReoderAfterDuplicatedOutputs(ir::Graph* graph, + int& dequantize_counter) { + GraphPatternDetector gpd; + patterns::DuplicatedOutputs duplicated_outputs{gpd.mutable_pattern(), + "duplicated_outputs"}; + duplicated_outputs(); + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(op, op, duplicated_outputs); + AddDequantizes(g, op, dequantize_counter); + }; + gpd(graph, handler); +} + +// Adding dequantize ops after all operators except split, which has +// already been handled in AddReoderAfterDuplicatedOutputs +void AddReoderAfterSingleOutputs(ir::Graph* graph, int& dequantize_counter) { GraphPatternDetector gpd; patterns::LastBfloat16Ops bfloat16_ops{gpd.mutable_pattern(), "last_bfloat16_ops"}; bfloat16_ops(); - int dequantize_counter = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); GET_IR_NODE_FROM_SUBGRAPH(op_out, op_out, bfloat16_ops); - - if (op->Op()->Type() != "prior_box") { - // Find the name of the output linking op to op_out - std::vector output_names; - for (auto name : op->Op()->OutputNames()) - for (auto output_name : op->Op()->Output(name)) - if (output_name == op_out->Name() && IsPermittedOutputName(name)) - output_names.push_back(name); - - if (output_names.empty()) return; - - VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); - auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); - - OpDesc deq_desc; - deq_desc.SetType("dequantize"); - deq_desc.SetInput("Input", - std::vector({dequantize_in_node->Name()})); - deq_desc.SetOutput("Output", std::vector({op_out->Name()})); - deq_desc.SetAttr("Scale", 1.0f); - deq_desc.SetAttr("Shift", 0.0f); - auto dequantize_op = - g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
- - for (auto name = output_names.begin(); name < output_names.end(); name++) - op->Op()->SetOutput( - *name, std::vector({dequantize_in_node->Name()})); - - UnlinkNodes(op, op_out); - IR_NODE_LINK_TO(op, dequantize_in_node); - IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(dequantize_op, op_out); - - dequantize_counter++; + GET_IR_NODE_FROM_SUBGRAPH(op, op, bfloat16_ops); + if (op->Op()->Type() != "split") { + AddDequantize(g, op, op_out, dequantize_counter); } }; gpd(graph, handler); +} + +void CPUBFloat16Pass::SetOutputDataType(ir::Graph* graph) const { + int dequantize_counter = 0; + AddReoderAfterDuplicatedOutputs(graph, dequantize_counter); + AddReoderAfterSingleOutputs(graph, dequantize_counter); PrettyLogDetail("--- added %d dequantize ops after bfloat16 op", dequantize_counter); } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc index f620b4c94fe89..877ee71fc2d85 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_pass_tester.cc @@ -45,7 +45,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); op->SetAttr("mkldnn_data_type", mkldnn_data_type); - } else if (type == "concat" || type == "sum") { + } else if (type == "concat" || type == "sum" || type == "split") { op->SetInput("X", inputs); op->SetOutput("Out", outputs); op->SetAttr("mkldnn_data_type", mkldnn_data_type); @@ -117,6 +117,7 @@ TEST(CpuBfloat16Pass, convolution) { bool use_mkldnn = true; int quant_op = 3; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescConv(use_mkldnn), quant_op, dequant_op, added_nodes); } @@ -140,6 +141,7 @@ TEST(CpuBfloat16Pass, double_input_ops) { bool use_mkldnn = true; int quant_op = 4; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDoubleInput(use_mkldnn), quant_op, dequant_op, added_nodes); @@ -164,11 +166,35 @@ TEST(CpuBfloat16Pass, duplicated_input_ops) { bool use_mkldnn = true; int quant_op = 5; int dequant_op = 3; + // each added op consists of 2 nodes int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDuplicatedInput(use_mkldnn), quant_op, dequant_op, added_nodes); } +ProgramDesc BuildProgramDescDuplicatedOutput(bool use_mkldnn) { + ProgramDesc prog; + for (auto& v : variable_names) { + prog.MutableBlock(0)->Var(v); + } + SetOp(&prog, "dropout", "Dropout", {"a"}, {"b"}, use_mkldnn, "float32"); + SetOp(&prog, "split", "Split", {"b"}, {"c", "d"}, use_mkldnn, "bfloat16"); + SetOp(&prog, "transpose2", "Transpose", {"c"}, {"e"}, use_mkldnn, "float32"); + SetOp(&prog, "reshape2", "Reshape", {"d"}, {"f"}, use_mkldnn, "bfloat16"); + + return prog; +} + +TEST(CpuBfloat16Pass, duplicated_output_ops) { + bool use_mkldnn = true; + int quant_op = 2; + int dequant_op = 3; + // each added op consists of 2 nodes + int added_nodes = quant_op * 2 + dequant_op * 2; + MainTest(BuildProgramDescDuplicatedOutput(use_mkldnn), quant_op, dequant_op, + added_nodes); +} + ProgramDesc BuildProgramDescDoubleOutputs(bool use_mkldnn) { ProgramDesc prog; for (auto& v : variable_names) { @@ -190,6 +216,7 @@ TEST(CpuBfloat16Pass, double_outputs_ops) { bool use_mkldnn = true; int quant_op = 3; int dequant_op = 3; + // each added op consists of 2 nodes 
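+  // (the quantize/dequantize op node itself plus the quantize_out /
+  // dequantize_in variable node it is wired through)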
int added_nodes = quant_op * 2 + dequant_op * 2; MainTest(BuildProgramDescDoubleOutputs(use_mkldnn), quant_op, dequant_op, added_nodes); diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 7b2166eaf11f9..722681fb7bc3f 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -70,4 +70,5 @@ PD_REGISTER_KERNEL(split, int64_t, int, bool, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} From dd2c997d6bd2f3dadfcf1ee78c2c94f36c9bd381 Mon Sep 17 00:00:00 2001 From: xiongkun Date: Thu, 24 Feb 2022 11:22:47 +0800 Subject: [PATCH 20/85] [Phi] Fix comilation dependecy in selected_rows with memory (#39834) --- paddle/phi/core/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 80bcc66477cb1..6ada063069905 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -23,7 +23,7 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor pten_enforce ddim) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor pten_enforce ddim memcpy) cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) From 94b31f90a41d653f9e587204ee71bdd49475f4d6 Mon Sep 17 00:00:00 2001 From: chentianyu03 Date: Thu, 24 Feb 2022 11:24:54 +0800 Subject: [PATCH 21/85] [pten] add optional type for infermeta (#39848) * modify infershape by args_def * add optional type for infermate * add optional type for infermate * add optional type for infermate * support scalar type * change OptionalInputAt function to none template * support phi::DataType --- paddle/fluid/framework/infershape_utils.cc | 85 ++++++++++++++++++---- paddle/phi/core/infermeta_utils.cc | 8 ++ paddle/phi/core/infermeta_utils.h | 21 ++++++ 3 files changed, 99 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index aae36cf455dfe..4bec1baeaaee9 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -20,6 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" #include "paddle/phi/core/compat/arg_map_context.h" #include "paddle/phi/core/compat/convert_utils.h" @@ -376,47 +377,101 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, attr_name)); } } + } else if (attr_defs[i].type_index == + std::type_index(typeid(phi::Scalar))) { + if (ctx->HasAttr(attr_name)) { + // TODO(chentianyu03): support other attrs later + auto& attr = attr_reader.GetAttr(attr_name); + if (std::type_index(attr.type()) == std::type_index(typeid(float))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(float, attr))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(std::string))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(std::string, attr))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::Scalar(BOOST_GET_CONST(int, attr))); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when construct " + "InferMetaContext.", + attr_name)); + } + } else if (ctx->HasInput(attr_name)) { + const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); + if (infershape_input.size() == 1) { + if (ctx->IsRuntime()) { + Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); + infer_meta_context.EmplaceBackAttr( + std::move(experimental::MakePtenScalarFromVar(*var))); + } else { + phi::Scalar tensor_scalar(-1); + tensor_scalar.SetFromTensor(true); + infer_meta_context.EmplaceBackAttr(std::move(tensor_scalar)); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid input.size() when cast op attribute `%s` to Scalar, " + "expected 1, but actually is %d .", + attr_name, infershape_input.size())); + } + } } else if (ctx->HasAttr(attr_name)) { // Emplace Back Attr according to the type of attr. 
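+      // Note: the branches below key off attr_defs[i].type_index (the type
+      // the phi kernel / infermeta expects) rather than the attribute's own
+      // runtime type, so e.g. a std::vector<int> attribute can be widened to
+      // std::vector<int64_t> and an int attribute mapped to phi::DataType.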
auto& attr = attr_reader.GetAttr(attr_name); - if (std::type_index(attr.type()) == std::type_index(typeid(bool))) { + if (attr_defs[i].type_index == std::type_index(typeid(bool))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(bool, attr)); - } else if (std::type_index(attr.type()) == std::type_index(typeid(int))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(int))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int, attr)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(int64_t))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(int64_t))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(int64_t, attr)); - } else if (std::type_index(attr.type()) == - std::type_index(typeid(float))) { + } else if (attr_defs[i].type_index == std::type_index(typeid(float))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(float, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::string))) { infer_meta_context.EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { - infer_meta_context.EmplaceBackAttr( - BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + if (std::type_index(attr.type()) == + std::type_index(typeid(std::vector))) { + // Emplace Back Attr according to the type of Phi_Kernel args. + const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); + const std::vector vector_int64_attr(vector_int_attr.begin(), + vector_int_attr.end()); + infer_meta_context.EmplaceBackAttr(vector_int64_attr); + } else { + infer_meta_context.EmplaceBackAttr( + BOOST_GET_CONST(std::vector, attr)); + } + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); - } else if (std::type_index(attr.type()) == + } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr( BOOST_GET_CONST(std::vector, attr)); + } else if (attr_defs[i].type_index == + std::type_index(typeid(phi::DataType))) { + auto data_type = paddle::framework::TransToPtenDataType( + static_cast( + BOOST_GET_CONST(int, attr))); + infer_meta_context.EmplaceBackAttr(data_type); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported attribute type is received when call " diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index d21232ed82296..f3dd056911ecf 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -67,6 +67,14 @@ const MetaTensor& InferMetaContext::InputAt(size_t idx) const { return *inputs_.at(idx); } +paddle::optional InferMetaContext::OptionalInputAt( + size_t idx) const { + const auto& input = inputs_.at(idx); + return input ? 
paddle::optional{static_cast< + const phi::MetaTensor&>(*input)} + : paddle::optional{paddle::none}; +} + std::vector InferMetaContext::InputsBetween(size_t start, size_t end) const { std::vector result; diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 7cf92e4d933b3..203dbb269841e 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -51,6 +51,9 @@ class InferMetaContext { const MetaConfig& GetMetaConfig() const; const MetaTensor& InputAt(size_t idx) const; + + paddle::optional OptionalInputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); std::vector MutableOutputBetween(size_t start, size_t end); @@ -135,6 +138,24 @@ struct InferMetaFnImpl { } }; + template + struct InferMetaFnCallHelper, Tail...> { + template + static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { + static_assert(attr_idx == 0, + "InferMeta's Input should appear before Attributes."); + static_assert(out_idx == 0, + "InferMeta's Input should appear before Outputs."); + const std::pair range = ctx->InputRangeAt(in_idx); + auto arg = ctx->OptionalInputAt(range.first); + + InferMetaFnCallHelper< + Tail...>::template Call(ctx, + pargs..., + arg); + } + }; + template struct InferMetaFnCallHelper&, Tail...> { template From 7a7a7cad9212ffc563ddcc576b40368f25d5702f Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 24 Feb 2022 11:25:11 +0800 Subject: [PATCH 22/85] [Phi] Fix XPU OP segmentation Fault problem (#39827) * [Phi] Fix XPU OP segmentation Fault problem * fix cast_op_xpu in kunlun1 * fix cast_op_xpu in kunlun1 --- paddle/fluid/framework/operator.cc | 20 ++++++++++------ paddle/fluid/imperative/prepared_operator.cc | 25 ++++++++++++-------- paddle/phi/core/compat/convert_utils.cc | 2 ++ 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 701fc7de6940a..692ebf6f332f1 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1211,7 +1211,17 @@ void OperatorWithKernel::RunImpl(const Scope& scope, << "` not found."; } } - if (pt_kernel_->IsValid()) { +#ifdef PADDLE_WITH_XPU + bool is_xpu_unsupport = + paddle::platform::is_xpu_place(kernel_type_->place_) && + !paddle::platform::is_xpu_support_op(type_, *kernel_type_.get()) || + paddle::platform::is_in_xpu_black_list(type_); +#endif + if (pt_kernel_->IsValid() +#ifdef PADDLE_WITH_XPU + && !is_xpu_unsupport +#endif + ) { run_pten_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); @@ -1220,13 +1230,9 @@ void OperatorWithKernel::RunImpl(const Scope& scope, kernels_iter->second.find(*kernel_type_.get()) == kernels_iter->second.end() #ifdef PADDLE_WITH_XPU - || - paddle::platform::is_xpu_place(kernel_type_->place_) && // NOLINT - !paddle::platform::is_xpu_support_op( - type_, *kernel_type_.get()) // NOLINT - || paddle::platform::is_in_xpu_black_list(type_) + || is_xpu_unsupport #endif - ) { + ) { auto pt_cpu_kernel_key = FallBackToCpu(*kernel_type_.get(), pt_kernel_key, *this); pt_kernel_.reset( diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 05218ba961fdd..6d18b0a86f091 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -161,6 +161,13 @@ PreparedOp PrepareImpl(const NameVarMap& ins, framework::KernelSignature pt_kernel_signature; phi::KernelKey pt_kernel_key; 
std::string pt_kernel_name; +#ifdef PADDLE_WITH_XPU + bool is_xpu_unsupport = + paddle::platform::is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(op.Type(), + expected_kernel_key) || + paddle::platform::is_in_xpu_black_list(op.Type()); +#endif if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx); VLOG(6) << pt_kernel_signature; @@ -170,7 +177,11 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, pt_kernel_key); - if (pt_kernel.IsValid()) { + if (pt_kernel.IsValid() +#ifdef PADDLE_WITH_XPU + && !is_xpu_unsupport +#endif + ) { VLOG(6) << "Dynamic mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << pt_kernel; @@ -197,13 +208,9 @@ PreparedOp PrepareImpl(const NameVarMap& ins, kernels_iter->second.find(expected_kernel_key) == kernels_iter->second.end()) #ifdef PADDLE_WITH_XPU - || - paddle::platform::is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(op.Type(), - expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()) + || is_xpu_unsupport #endif - ) { + ) { if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); @@ -230,9 +237,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #ifdef PADDLE_WITH_XPU if (paddle::platform::is_xpu_place(expected_kernel_key.place_) && - (kernel_iter == kernels.end() || - !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key) || - paddle::platform::is_in_xpu_black_list(op.Type()))) { + (kernel_iter == kernels.end() || is_xpu_unsupport)) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index b4e7e127995ec..a5b7b869b948d 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -30,6 +30,8 @@ Backend TransToPtenBackend(const phi::Place& place) { return Backend::CPU; } else if (place.GetType() == phi::AllocationType::GPU) { return Backend::GPU; + } else if (place.GetType() == phi::AllocationType::XPU) { + return Backend::XPU; } else if (place.GetType() == phi::AllocationType::CUSTOM) { return static_cast( static_cast(Backend::NUM_BACKENDS) + From 4e26fa577198e2ab978ef0091329ff5a6b2e707a Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 24 Feb 2022 11:27:33 +0800 Subject: [PATCH 23/85] fix 'invalid escape sequence' (#39842) * fix 'invalid escape sequence' * fix assert error --- python/paddle/distributed/auto_parallel/cost_model.py | 4 ++-- python/paddle/incubate/nn/functional/fused_transformer.py | 4 ++-- python/paddle/nn/functional/loss.py | 2 +- python/paddle/nn/functional/pooling.py | 6 +++--- python/paddle/nn/initializer/dirac.py | 2 +- python/paddle/nn/layer/common.py | 2 +- python/paddle/nn/layer/pooling.py | 6 +++--- python/paddle/signal.py | 4 ++-- python/paddle/tensor/random.py | 4 ++-- python/paddle/vision/ops.py | 4 ++-- 10 files changed, 19 insertions(+), 19 deletions(-) diff --git a/python/paddle/distributed/auto_parallel/cost_model.py b/python/paddle/distributed/auto_parallel/cost_model.py index 1155c2817a21c..b72c044428f6c 100644 --- a/python/paddle/distributed/auto_parallel/cost_model.py +++ 
b/python/paddle/distributed/auto_parallel/cost_model.py @@ -426,7 +426,7 @@ def _merge_node(self, to_merge_node_list, merge_type='linear', nodes=None): return merged_node_id, merged_node def merge_linear(self): - ''' + r''' This method does the following: If X depends on Y only, they must be run sequentially. [ e.g. A ->- C ->- D D and E depends on C only.] @@ -442,7 +442,7 @@ def merge_linear(self): return cnt def merge_branch(self): - ''' + r''' This method does the following: If a node has more than one successor, there is *branch*. [ e.g. A ->- B ->- D ] diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index 3569d372fa6dc..d600cda8454cc 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -46,7 +46,7 @@ def fused_feedforward(x, training=True, mode='upscale_in_train', name=None): - """ + r""" This is a fusion operator to compute feed forward layer in transformer model architecture. This operator only supports running on GPU. The function of the operator is consistent with the following pseudo code: @@ -230,7 +230,7 @@ def fused_multi_head_attention(x, training=True, mode='upscale_in_train', name=None): - """ + r""" Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending to information from different representation subspaces. This API only diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 94c516f476ede..e59ef5ebfb0ab 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1113,7 +1113,7 @@ def margin_cross_entropy(logits, group=None, return_softmax=False, reduction='mean'): - """ + r""" .. math:: L=-\\frac{1}{N}\sum^N_{i=1}\log\\frac{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\\neq y_i} e^{scos\\theta_{y_i}}} diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index a528a72ec5cac..34a0159fbb0dc 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -670,7 +670,7 @@ def max_unpool1d(x, data_format="NCL", output_size=None, name=None): - """ + r""" This API implements max unpooling 1d opereation. `max_unpool1d` accepts the output of `max_pool1d` as input, including the indices of the maximum value and calculate the partial inverse. @@ -779,7 +779,7 @@ def max_unpool2d(x, data_format="NCHW", output_size=None, name=None): - """ + r""" This API implements max unpooling 2d opereation. See more details in :ref:`api_nn_pooling_MaxUnPool2D` . @@ -894,7 +894,7 @@ def max_unpool3d(x, data_format="NCDHW", output_size=None, name=None): - """ + r""" This API implements max unpooling 3d opereation. `max_unpool3d` accepts the output of `max_pool3d` as input, including the indices of the maximum value and calculate the partial inverse. diff --git a/python/paddle/nn/initializer/dirac.py b/python/paddle/nn/initializer/dirac.py index 514afb15a8edb..da3266ab33694 100644 --- a/python/paddle/nn/initializer/dirac.py +++ b/python/paddle/nn/initializer/dirac.py @@ -23,7 +23,7 @@ class Dirac(Initializer): - """Initialize the 3D/4D/5D Tensor with Dirac delta function. + r"""Initialize the 3D/4D/5D Tensor with Dirac delta function. 
It can reserve the feature of convolution layer input, which means that as many channels are reserved as possible. diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 9ae9d5bec437e..19fbcd5b6f856 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -1554,7 +1554,7 @@ def extra_repr(self): class Fold(Layer): - """ + r""" This Op is used to combines an array of sliding local blocks into a large containing tensor. also known as col2im when operated on batched 2D image tensor. Fold calculates each diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 96942f5c8500a..68808c6354afb 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -1131,7 +1131,7 @@ def extra_repr(self): class MaxUnPool1D(Layer): - """ + r""" This API implements max unpooling 1d opereation. `max_unpool1d` accepts the output of `max_pool1d` as input, @@ -1213,7 +1213,7 @@ def extra_repr(self): class MaxUnPool2D(Layer): - """ + r""" This API implements max unpooling 2d opereation. 'max_unpool2d' accepts the output of 'max_unpool2d' as input @@ -1299,7 +1299,7 @@ def extra_repr(self): class MaxUnPool3D(Layer): - """ + r""" This API implements max unpooling 3d opereation. `max_unpool3d` accepts the output of `max_pool3d` as input, diff --git a/python/paddle/signal.py b/python/paddle/signal.py index fc80c7cbc80f3..cd8ba2b58a8c9 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -243,7 +243,7 @@ def stft(x, normalized=False, onesided=True, name=None): - """ + r""" Short-time Fourier transform (STFT). The STFT computes the discrete Fourier transforms (DFT) of short overlapping @@ -398,7 +398,7 @@ def istft(x, length=None, return_complex=False, name=None): - """ + r""" Inverse short-time Fourier transform (ISTFT). Reconstruct time-domain signal from the giving complex input and window tensor when diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index c4e7e96191acf..660803f9f7475 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -81,7 +81,7 @@ def bernoulli(x, name=None): def poisson(x, name=None): - """ + r""" This OP returns a tensor filled with random number from a Poisson Distribution. .. math:: @@ -984,7 +984,7 @@ def rand(shape, dtype=None, name=None): def exponential_(x, lam=1.0, name=None): - """ + r""" This inplace OP fill input Tensor ``x`` with random number from a Exponential Distribution. ``lam`` is :math:`\lambda` parameter of Exponential Distribution. 
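Why the r prefix matters for the docstrings touched in this patch: they embed LaTeX such as \lambda and \frac, and inside a normal string literal a backslash before an ordinary letter is an "invalid escape sequence" that Python 3.6+ reports as a DeprecationWarning (a SyntaxWarning on newer interpreters). A raw string keeps the backslash verbatim and compiles silently. A small self-contained illustration, independent of Paddle (the literal and variable names here are for demonstration only):

import warnings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Normal literal: Python flags "\l" as an invalid escape sequence.
    compile('doc = "escape \\lambda"', "<example>", "exec")
    # Raw literal: the backslash is kept verbatim, no warning is emitted.
    compile('doc = r"escape \\lambda"', "<example>", "exec")

print([str(w.message) for w in caught])  # ["invalid escape sequence '\\l'"]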
diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 68cd3ae72a6aa..03060e92bdb69 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -949,8 +949,8 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): if isinstance(output_size, int): output_size = (output_size, output_size) pooled_height, pooled_width = output_size - assert (len(x.shape) == 4, - "Input features with shape should be (N, C, H, W)") + assert len(x.shape) == 4, \ + "Input features with shape should be (N, C, H, W)" output_channels = int(x.shape[1] / (pooled_height * pooled_width)) if in_dygraph_mode(): return _C_ops.psroi_pool(x, boxes, boxes_num, "output_channels", From 1abfc8dda8501e37c6f7ffd82d0dabc616563fba Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 24 Feb 2022 11:47:40 +0800 Subject: [PATCH 24/85] Refactored GradNodeAccumulation data structure and behaviour (#39526) * Refactored GradNodeAccumulation data structure and behaviour * Fixed CI issues * Fix compilation issues * Fixed minor issues * Reverted changes for intermediate and OverwriteOutput * fixed minor issue * Fixed code format issues * Fixed CI-Coverage issue * Fixed CI issues --- .../eager/accumulation/accumulation_node.cc | 23 ++--- .../eager/accumulation/accumulation_node.h | 13 ++- paddle/fluid/eager/api/utils/hook_utils.cc | 32 +++---- paddle/fluid/eager/api/utils/tensor_utils.cc | 2 +- .../auto_code_generator/eager_generator.cc | 4 + paddle/fluid/eager/autograd_meta.h | 1 + paddle/fluid/eager/grad_node_info.cc | 4 +- paddle/fluid/eager/tensor_wrapper.h | 15 ++-- .../accumulation_node_test.cc | 88 +++++++++++-------- .../eager/tests/task_tests/backward_test.cc | 63 +++++-------- .../cross_batch_accumulation_test.cc | 48 +++++----- .../fluid/eager/tests/task_tests/hook_test.cc | 51 ++++------- paddle/fluid/eager/utils.cc | 15 +++- paddle/fluid/eager/utils.h | 2 + paddle/fluid/pybind/eager.cc | 3 +- paddle/fluid/pybind/eager_method.cc | 41 ++++----- paddle/fluid/pybind/eager_properties.cc | 44 ++++------ .../paddle/fluid/tests/unittests/op_test.py | 9 +- 18 files changed, 215 insertions(+), 243 deletions(-) diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 9b0e784c0efb1..2e377e43ca3ec 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -25,6 +25,8 @@ #include "glog/logging.h" +namespace egr { + static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, const paddle::experimental::Tensor& t) { if (!tensor->defined() || !tensor->initialized()) { @@ -36,14 +38,6 @@ static void CopyOrAddTensor(paddle::experimental::Tensor* tensor, } } -namespace egr { - -void GradNodeAccumulation::RetainGrad( - const std::function& hook) { - retain_grad_hook_ = hook; -} - std::vector> GradNodeAccumulation:: operator()( const std::vector>& grads) { @@ -59,17 +53,18 @@ operator()( "However received: %d in slot %d .", grads[0].size(), 0)); // Apply Gradient Hooks + paddle::experimental::Tensor grad_out; if (GradientHooksRegistered()) { std::vector> hooked_grads = ApplyGradientHooks(grads); - // TODO(jiabin): It's little weird - CopyOrAddTensor(&accumulated_grad, hooked_grads[0][0]); + grad_out = hooked_grads[0][0]; } else { - CopyOrAddTensor(&accumulated_grad, grads[0][0]); + grad_out = grads[0][0]; } - if (retain_grad_hook_ != nullptr) { - retain_grad_hook_(accumulated_grad); + if (!weak_grad_.expired()) { + auto grad = 
weak_grad_.lock(); + CopyOrAddTensor(grad.get(), grad_out); } // Apply Reduce Hooks @@ -77,7 +72,7 @@ operator()( ApplyReduceHooks(); } - return {{accumulated_grad}}; + return {{grad_out}}; } void GradNodeAccumulation::RegisterReduceHook( diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 3f53517204a5a..787149ab30526 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -14,6 +14,7 @@ #pragma once +#include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" namespace egr { @@ -21,7 +22,10 @@ namespace egr { class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node - GradNodeAccumulation() : GradNodeBase(1, 1) { SetDefaultGradInOutMeta(); } + explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { + weak_grad_ = meta->WeakGrad(); + SetDefaultGradInOutMeta(); + } ~GradNodeAccumulation() override = default; @@ -30,11 +34,6 @@ class GradNodeAccumulation : public GradNodeBase { const std::vector>& grads) override; - void RetainGrad(const std::function& hook); - - paddle::experimental::Tensor* Grad() { return &accumulated_grad; } - std::string name() { return "GradNodeAccumulation"; } /** @@ -49,7 +48,7 @@ class GradNodeAccumulation : public GradNodeBase { void ApplyReduceHooks(); private: - paddle::experimental::Tensor accumulated_grad; + std::weak_ptr weak_grad_; std::function diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 7d2997eb884c8..748afe6d1f313 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -52,9 +52,15 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, } } -void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { - // TODO(jiabin): Support More Tensor type here +static void RetainGradForRegularNode( + const paddle::experimental::Tensor& tensor) { AutogradMeta* meta = EagerUtils::unsafe_autograd_meta(tensor); + if (meta->RetainGrads()) { + return; + } else { + meta->SetRetainGrads(true); + } + std::weak_ptr weak_grad_tensor = meta->WeakGrad(); @@ -79,21 +85,17 @@ void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { } }; - if (IsLeafTensor(tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = EagerUtils::grad_node(tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RetainGrad(hook); + // Append to GradientHooks + RegisterGradientHookForTensor(tensor, hook); +} +void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { + if (IsLeafTensor(tensor)) { + // Leaf tensor's grad will always be retained + // Refer to implementation of AccumulationNode for more details + return; } else { - // Append to GradientHooks - RegisterGradientHookForTensor(tensor, hook); + RetainGradForRegularNode(tensor); } } diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index c06edef7017be..628c0c500b3c4 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -47,7 +47,7 @@ 
paddle::experimental::Tensor CreateTensorWithValue( auto meta = EagerUtils::autograd_meta(&out); if (is_leaf) { - auto accumulation_node = std::make_shared(); + auto accumulation_node = std::make_shared(meta); meta->SetGradNode(accumulation_node); meta->SetStopGradient(false); } diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 7cddfd9c1c7dc..e1f4d6ee9a129 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -1031,6 +1031,8 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_name = output.name(); const std::string& output_autograd_name = "p_autograd_" + output_name; + // Skip Intermediate Tensor + if (output.duplicable()) { const char* GET_MULTI_AUTOGRAD_META_TEMPLATE = " std::vector %s = " @@ -1145,6 +1147,8 @@ static std::string GenerateGradNodeCreationContent( const std::string& output_autograd_name = "p_autograd_" + output_name; size_t output_position = fwd_outputs_name_pos_map.at(output_name); + // Intermediate Tensor does not require SetHistory, nor RetainGrad + if (output.duplicable()) { pass_stop_gradient_args += ", &" + output_autograd_name; const char* SET_OUT_RANK_TEMPLATE = diff --git a/paddle/fluid/eager/autograd_meta.h b/paddle/fluid/eager/autograd_meta.h index 53f17a4ffe58c..9e1dc4f2c8c6b 100644 --- a/paddle/fluid/eager/autograd_meta.h +++ b/paddle/fluid/eager/autograd_meta.h @@ -97,6 +97,7 @@ class AutogradMeta : public AbstractAutogradMeta { "Should Not set NULL as GradNode pointer, since " "our default Edge and autogradMeta has nullptr for " "grad node. Set Nullptr will lead error.")); + grad_node_ = grad_node; } diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index d83fa916db66c..27c376b4c80c6 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -53,7 +53,7 @@ void GradNodeBase::AddEdges(std::vector* metas, size_t slot_id) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } @@ -76,7 +76,7 @@ void GradNodeBase::AddEdges(AutogradMeta* meta, size_t slot_id) { adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), meta->OutRankInfo()); } else { - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); VLOG(6) << "Add Edges for slot: " << slot_id << ", the Edge is from " << this->name() << " to " << meta->GetMutableGradNode()->name(); adj_edges_[slot_id].emplace_back(meta->GetMutableGradNode(), diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index 1732e0513d524..31aaa93c41643 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -66,14 +66,13 @@ class TensorWrapper { } intermidiate_tensor_.set_name(tensor.name() + "@Saved"); - PADDLE_ENFORCE_NOT_NULL( - EagerUtils::unsafe_autograd_meta(tensor), - paddle::platform::errors::Fatal( - "Full reserved Tensor should not have null autograd meta, since " - "tensor_wrapper is used to build backward info. There is no way " - "for us to build it with null autograd_meta.")); - // copy output_rank - out_rank_info_ = EagerUtils::OutRankInfo(tensor); + + // If an output is marked "intermedaite", we won't create + // autograd_meta for it. 
+ // In that case, simply skip OutRankInfo Copy + if (EagerUtils::nullable_autograd_meta(tensor)) { + out_rank_info_ = EagerUtils::OutRankInfo(tensor); + } } paddle::experimental::Tensor recover( diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 682e55e7d9294..880bd26841027 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -17,11 +17,13 @@ #include "gtest/gtest.h" #include "paddle/fluid/eager/accumulation/accumulation_node.h" +#include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" #include "paddle/fluid/eager/grad_tensor_holder.h" -#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/fluid/eager/utils.h" +#include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" // TODO(jiabin): remove nolint here!!! @@ -37,7 +39,7 @@ TEST(AccumulationNode, Tensor) { .get(), meta); dt0->mutable_data( - paddle::platform::CPUPlace())[0] = 10.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(10.0f); paddle::experimental::Tensor et0 = paddle::experimental::Tensor(dt0); std::shared_ptr dt1 = std::make_shared( @@ -47,84 +49,100 @@ TEST(AccumulationNode, Tensor) { meta); dt1->mutable_data( - paddle::platform::CPUPlace())[0] = 20.0; + paddle::platform::CPUPlace())[0] = paddle::platform::float16(20.0f); paddle::experimental::Tensor et1 = paddle::experimental::Tensor(dt1); + std::shared_ptr input_dt = + std::make_shared( + std::make_unique( + paddle::platform::CPUPlace()) + .get(), + meta); + paddle::experimental::Tensor input_et = + paddle::experimental::Tensor(input_dt); + auto grad_meta = EagerUtils::autograd_meta(&input_et); + + // Initialize Grad Tensor std::shared_ptr grad_dt = std::make_shared( std::make_unique( paddle::platform::CPUPlace()) .get(), meta); - paddle::experimental::Tensor grad_et = paddle::experimental::Tensor(grad_dt); + grad_dt->mutable_data( + paddle::platform::CPUPlace())[0] = paddle::platform::float16(0.0f); + grad_meta->MutableGrad()->set_impl(grad_dt); // AccumulationNode - GradNodeAccumulation node = GradNodeAccumulation(); - - // Hook, RetainGrad - std::function - hook = [&grad_et](const paddle::experimental::Tensor& t) { - grad_et.set_impl(t.impl()); - return grad_et; - }; - node.RetainGrad(hook); + auto node = std::make_shared(grad_meta); + grad_meta->SetGradNode(node); + grad_meta->SetStopGradient(false); // operator() - paddle::experimental::Tensor ret_et0 = node({{et0}})[0][0]; + paddle::experimental::Tensor ret_et0 = node->operator()({{et0}})[0][0]; auto* ret_et0_ptr = std::dynamic_pointer_cast(ret_et0.impl()) ->data(); CHECK_EQ(ret_et0_ptr[0], paddle::platform::float16(10.0f)); - paddle::experimental::Tensor ret_et1 = node({{et1}})[0][0]; + paddle::experimental::Tensor ret_et1 = node->operator()({{et1}})[0][0]; + auto* ret_et1_ptr = std::dynamic_pointer_cast(ret_et1.impl()) ->data(); - CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(30.0f)); + CHECK_EQ(ret_et1_ptr[0], paddle::platform::float16(20.0f)); - // Retain Grad - auto* ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - CHECK_EQ(ret_grad_et_ptr[0], paddle::platform::float16(30.0f)); + // Check Retain Grad + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + 
paddle::experimental::Tensor* grad = EagerUtils::mutable_grad(input_et); + auto* grad_ptr = std::dynamic_pointer_cast(grad->impl()) + ->data(); + CHECK_EQ(grad_ptr[0], paddle::platform::float16(30.0f)); // Reduce Hook case 1: Call RegisterReduceHook and run operator() VLOG(6) << "Test Reduce Hook"; + CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) + ->data()[0], + paddle::platform::float16(10.0f)); + auto reduce_hook_1 = [&](void) -> void { - auto* grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) - ->data(); - grad_et_ptr[0] = 36.0; + auto* input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) + ->mutable_data( + paddle::platform::CPUPlace()); + input_et_ptr[0] = 36.0; VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_1); + node->RegisterReduceHook(reduce_hook_1); // operator() - paddle::experimental::Tensor _ret = node({{et0}})[0][0]; + paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; // Check operator() result, should be 36.0 auto* _ret_ptr = std::dynamic_pointer_cast(_ret.impl()) ->data(); - CHECK_EQ(_ret_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_ptr[0], paddle::platform::float16(10.0f)); // Check Retain Grad, should be 36.0 - auto* _ret_grad_et_ptr = - std::dynamic_pointer_cast(grad_et.impl()) + auto* _ret_input_et_ptr = + std::dynamic_pointer_cast(input_et.impl()) ->data(); - CHECK_EQ(_ret_grad_et_ptr[0], paddle::platform::float16(36.0f)); + CHECK_EQ(_ret_input_et_ptr[0], paddle::platform::float16(36.0f)); // Reduce Hook case 2: Call RegisterReduceHook and ApplyReduceHooks directly VLOG(6) << "Test Reduce Hook"; auto reduce_hook_2 = [&](void) -> void { auto* ret_et0_ptr = std::dynamic_pointer_cast(et0.impl()) - ->data(); + ->mutable_data( + paddle::platform::CPUPlace()); ret_et0_ptr[0] = 100.0; // set to 100.0 VLOG(6) << "Running Reduce Hook"; }; - node.RegisterReduceHook(reduce_hook_2); - node.ApplyReduceHooks(); + node->RegisterReduceHook(reduce_hook_2); + node->ApplyReduceHooks(); // Check ApplyReduceHooks result CHECK_EQ(std::dynamic_pointer_cast(et0.impl()) diff --git a/paddle/fluid/eager/tests/task_tests/backward_test.cc b/paddle/fluid/eager/tests/task_tests/backward_test.cc index 771b324a69b5a..a4bc56bd606f3 100644 --- a/paddle/fluid/eager/tests/task_tests/backward_test.cc +++ b/paddle/fluid/eager/tests/task_tests/backward_test.cc @@ -59,22 +59,18 @@ TEST(Backward, SingleNodeEmptyGrad) { auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta1->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } std::vector outs = {target_tensor}; @@ -123,22 +119,17 @@ TEST(Backward, SingleNodeCustomGrad) { std::dynamic_pointer_cast(node0_ptr)); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); auto_grad_meta->SetStopGradient(false); - // Connect 
Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); + // Connect Tensor and AccumulationNode via AutoGradMeta + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); + auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node0 -> AccumulationNode via Edge - auto meta = egr::AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; + auto_grad_meta1->SetStopGradient(false); + std::vector res = {auto_grad_meta1}; node0_ptr->AddEdges(&res, 0); } @@ -201,22 +192,17 @@ TEST(Backward, LinearNodes) { std::vector res0 = {&meta0}; node0_ptr->AddEdges(&res0, 0); + AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta1); - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta1->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node1 -> AccumulationNode via Edge - auto meta1 = egr::AutogradMeta(); - meta1.SetStopGradient(false); - meta1.SetSingleOutRankWithSlot(0, 0); - meta1.SetGradNode(acc_node_ptr); - std::vector res1 = {&meta1}; + auto_grad_meta1->SetStopGradient(false); + std::vector res1 = {auto_grad_meta1}; node1_ptr->AddEdges(&res1, 0); } @@ -311,22 +297,17 @@ TEST(Backward, WithAccumulation) { std::vector res1 = {&meta1}; node1_ptr->AddEdges(&res1, 0); + AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); // Connect Tensor and AccumulationNode via AutoGradMeta - auto acc_node_ptr = std::make_shared(); + auto acc_node_ptr = + std::make_shared(auto_grad_meta2); - AutogradMeta* auto_grad_meta2 = EagerUtils::autograd_meta(&leaf_tensor); auto_grad_meta2->SetGradNode( std::dynamic_pointer_cast(acc_node_ptr)); auto_grad_meta2->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - - // Connect Node2 -> AccumulationNode via Edge - auto meta2 = egr::AutogradMeta(); - meta2.SetStopGradient(false); - meta2.SetSingleOutRankWithSlot(0, 0); - meta2.SetGradNode(acc_node_ptr); - std::vector res2 = {&meta2}; + auto_grad_meta2->SetStopGradient(false); + std::vector res2 = {auto_grad_meta2}; node2_ptr->AddEdges(&res2, 0); } diff --git a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc index a44ca6fcffbff..524872b2e5563 100644 --- a/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc +++ b/paddle/fluid/eager/tests/task_tests/cross_batch_accumulation_test.cc @@ -46,34 +46,26 @@ TEST(CrossBatchAccumulation, SingleScaleNode) { paddle::experimental::Tensor& target_tensor = target_tensors[0]; paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); - { - auto scale_node_ptr = std::make_shared(1, 1); - scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); - - scale_node_ptr->SetDefaultGradInOutMeta(); - - auto acc_node_ptr = std::make_shared(); - - AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); - auto_grad_meta->SetGradNode( - 
std::dynamic_pointer_cast(scale_node_ptr)); - auto_grad_meta->SetSingleOutRankWithSlot(0, 0); - auto_grad_meta->SetStopGradient(false); - egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - - auto meta = AutogradMeta(); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetStopGradient(false); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); - - AutogradMeta* auto_grad_meta1 = EagerUtils::autograd_meta(&leaf_tensor); - auto_grad_meta1->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); - auto_grad_meta1->SetSingleOutRankWithSlot(0, 0); - egr_utils_api::RetainGradForTensor(leaf_tensor); - } + + auto scale_node_ptr = std::make_shared(1, 1); + scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); + + scale_node_ptr->SetDefaultGradInOutMeta(); + + AutogradMeta* auto_grad_meta = EagerUtils::autograd_meta(&target_tensor); + auto_grad_meta->SetGradNode( + std::dynamic_pointer_cast(scale_node_ptr)); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + auto_grad_meta->SetStopGradient(false); + egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 + + AutogradMeta* meta = EagerUtils::autograd_meta(&leaf_tensor); + auto acc_node_ptr = std::make_shared(meta); + meta->SetStopGradient(false); + meta->SetSingleOutRankWithSlot(0, 0); + meta->SetGradNode(acc_node_ptr); + std::vector res = {meta}; + scale_node_ptr->AddEdges(&res, 0); RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index bf2f620dd19ba..fbc71168fe416 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -79,9 +79,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); - // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad { @@ -102,16 +99,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); egr_utils_api::RetainGradForTensor( target_tensor); // result: 1.0 + 3.0 = 4.0 - } - - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); + egr_utils_api::RetainGradForTensor( + target_tensor); // result: 1.0 + 3.0 = 4.0 } // Retain Grad for leaf tensor1 @@ -123,9 +112,16 @@ TEST(RetainGrad, HookBeforeRetainGrad) { hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + + auto_grad_meta->SetStopGradient(false); + auto_grad_meta->SetGradNode(acc_node_ptr); auto_grad_meta->SetSingleOutRankWithSlot(0, 0); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); @@ -160,8 +156,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { scale_node_ptr->SetAttributes_scale(5.0 /*scale*/); // Set grad in/out meta for node0 scale_node_ptr->SetDefaultGradInOutMeta(); - // Create AccumulationNode - auto acc_node_ptr = std::make_shared(); // Connect Input Tensor and ScaleNode via AutoGradMeta // Apply RetainGrad @@ -184,16 +178,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { 
egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); } - // Connect ScaleNode -> AccumulationNode via Edge - { - auto meta = AutogradMeta(); - meta.SetStopGradient(false); - meta.SetSingleOutRankWithSlot(0, 0); - meta.SetGradNode(acc_node_ptr); - std::vector res = {&meta}; - scale_node_ptr->AddEdges(&res, 0); - } - // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { @@ -203,17 +187,18 @@ TEST(RetainGrad, HookAfterRetainGrad) { hook = &hook_function; auto auto_grad_meta = std::make_shared(); - auto_grad_meta->SetGradNode( - std::dynamic_pointer_cast(acc_node_ptr)); + auto acc_node_ptr = + std::make_shared(auto_grad_meta.get()); + auto_grad_meta->SetGradNode(acc_node_ptr); + auto_grad_meta->SetStopGradient(false); + std::vector res = {auto_grad_meta.get()}; + scale_node_ptr->AddEdges(&res, 0); + auto_grad_meta->SetSingleOutRankWithSlot(0, 0); leaf_tensor.set_autograd_meta( std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RetainGradForTensor( - leaf_tensor); // RetainGrad for leaf tensor gets - // postponed, result: 4.0*5.0 + 3.0 = - // 23.0 egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); } diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 5d8dff5cd5b24..7464ad7413585 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/eager/api/utils/hook_utils.h" #include "paddle/fluid/eager/tensor_wrapper.h" @@ -21,7 +22,6 @@ #include "paddle/phi/common/layout.h" #include "paddle/phi/core/tensor_meta.h" -#include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/variable.h" @@ -109,6 +109,16 @@ std::shared_ptr EagerUtils::grad_node( } } +paddle::experimental::Tensor* EagerUtils::mutable_grad( + const paddle::experimental::Tensor& target) { + auto* meta = nullable_autograd_meta(target); + if (meta) { + return meta->MutableGrad(); + } else { + return nullptr; + } +} + void EagerUtils::SetHistory(std::vector* autograd_metas, const std::shared_ptr& grad_node) { for (const auto& autograd_meta : *autograd_metas) { @@ -342,7 +352,8 @@ std::shared_ptr EagerUtils::GetGradAccumulationNode( } else { if (!autograd_ptr->StopGradient()) { VLOG(6) << "Add GradNodeAccumulation for tensor: " << tensor.name(); - autograd_ptr->SetGradNode(std::make_shared()); + autograd_ptr->SetGradNode( + std::make_shared(autograd_ptr)); return autograd_ptr->GetMutableGradNode(); } else { return nullptr; diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index b74d68db2a6d5..fa5735e6f32a0 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -102,6 +102,8 @@ class EagerUtils { static std::shared_ptr grad_node( const paddle::experimental::Tensor& target); + static paddle::experimental::Tensor* mutable_grad( + const paddle::experimental::Tensor& target); // Set history is used to set backward info during forward process, it will // set forward var's autograd meta's grad node as current backward node. 
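The behaviour this refactor preserves for users: gradients reaching a leaf tensor are always accumulated into the grad buffer held by its AutogradMeta, since GradNodeAccumulation now keeps only a weak reference to that buffer instead of owning a private accumulated_grad. A short sketch of the accumulation semantics from the Python side; it assumes a Paddle build with dygraph/eager mode enabled and only shows the summing behaviour, not this patch's C++ internals:

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)  # leaf tensor
for _ in range(2):
    loss = (x * 5.0).sum()
    loss.backward()       # each pass feeds GradNodeAccumulation for x

# The two passes are summed into x.grad (d/dx of 5*x is 5, twice -> 10),
# which is the CopyOrAddTensor path exercised by the code above.
print(x.grad)             # [10., 10., 10.]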
diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index 2296169a16104..d9a2dcb686909 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -86,7 +86,8 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, if (!autograd_meta->GetMutableGradNode()) { VLOG(3) << "Tensor(" << name << ") have not GradNode, add GradNodeAccumulation for it."; - autograd_meta->SetGradNode(std::make_shared()); + autograd_meta->SetGradNode( + std::make_shared(autograd_meta)); } } diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 27328bea692af..4e900ae2ffbc1 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -177,7 +177,7 @@ static PyObject* tensor_retain_grads(TensorObject* self, PyObject* args, if (!meta->GetMutableGradNode()) { VLOG(6) << "Make grad node of tensor: " << self->tensor.name() << "become accumulation node"; - meta->SetGradNode(std::make_shared()); + meta->SetGradNode(std::make_shared(meta)); } egr::egr_utils_api::RetainGradForTensor(self->tensor); } @@ -199,17 +199,12 @@ static PyObject* tensor_clear_gradient(TensorObject* self, PyObject* args, paddle::experimental::Tensor* grad; if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - grad = accumulation_grad_node->Grad(); + grad = egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); grad = meta->MutableGrad(); @@ -248,19 +243,15 @@ static PyObject* tensor__zero_grads(TensorObject* self, PyObject* args, if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { // Add RetainGrad as PostHook to AccumulationNode - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - if (accumulation_grad_node->Grad()->initialized()) { - accumulation_grad_node->Grad()->set_impl( - paddle::experimental::zeros_like(*(accumulation_grad_node->Grad())) - .impl()); + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + if (grad->initialized()) { + grad->set_impl(paddle::experimental::zeros_like(*(grad)).impl()); } } else { auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor); diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index 43cfb50f2afe1..2e1390cb96155 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -70,26 +70,13 @@ PyObject* tensor_properties_get_stop_gradient(TensorObject* self, PyObject* 
tensor_properties_get_grad(TensorObject* self, void* closure) { EAGER_TRY - if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - return ToPyObject(*accumulation_grad_node->Grad()); + VLOG(6) << "Get grad for tensor: " << self->tensor.name(); + auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); + if (meta) { + return ToPyObject(meta->Grad()); } else { - VLOG(6) << "Get grad for tensor: " << self->tensor.name(); - auto meta = egr::EagerUtils::nullable_autograd_meta(self->tensor); - if (meta) { - return ToPyObject(meta->Grad()); - } else { - Py_INCREF(Py_None); - return Py_None; - } + Py_INCREF(Py_None); + return Py_None; } EAGER_CATCH_AND_THROW_RETURN_NULL } @@ -101,16 +88,15 @@ int tensor_properties_set_grad(TensorObject* self, PyObject* value, PADDLE_ENFORCE( egr::egr_utils_api::IsLeafTensor(self->tensor), paddle::platform::errors::Fatal("Only leaf Tensor can be set grad.")); - std::shared_ptr grad_node = - egr::EagerUtils::grad_node(self->tensor); - PADDLE_ENFORCE( - grad_node.get() != nullptr, - paddle::platform::errors::Fatal("Detected NULL grad_node" - "Leaf tensor should have had grad_node " - "with type: GradNodeAccumulation")); - auto accumulation_grad_node = - std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->Grad()->copy_(src, true); + + paddle::experimental::Tensor* grad = + egr::EagerUtils::mutable_grad(self->tensor); + PADDLE_ENFORCE(grad != nullptr, + paddle::platform::errors::Fatal( + "Detected NULL grad" + "Please check if you have manually cleared" + "the grad inside autograd_meta")); + grad->copy_(src, true); return 0; EAGER_CATCH_AND_THROW_RETURN_ZERO } diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 92cba4fca5aba..848ebae0706e3 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -606,8 +606,12 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict): if is_input: v = self._create_var_from_numpy(np_value_temp) + if if_return_inputs_grad_dict: v.stop_gradient = False + if _in_eager_mode(): + v.retain_grads() + if has_lod: v.value().get_tensor().set_recursive_sequence_lengths( lod_temp) @@ -618,7 +622,6 @@ def create_var(np_value, name, is_input, if_return_inputs_grad_dict): type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, stop_gradient=False) - return v # prepare variable for input or output @@ -681,7 +684,6 @@ def _calc_dygraph_output(self, place, parallel=False, no_check_set=None): # prepare input variable inputs = self.append_input_output_for_dygraph(op_proto, self.inputs, True, False, block) - # prepare output variable outputs = self.append_input_output_for_dygraph( op_proto, self.outputs, False, False, block) @@ -1741,6 +1743,7 @@ def _get_dygraph_grad(self, for attrs_name in self.attrs: if self.attrs[attrs_name] is not None: attrs_outputs[attrs_name] = self.attrs[attrs_name] + block.append_op( type=self.op_type, inputs=inputs, @@ -1817,7 +1820,9 @@ def _get_dygraph_grad(self, inputs={"X": loss_sum}, outputs={"Out": loss}, attrs={'scale': 1.0 / float(len(avg_sum))}) + loss.backward() + fetch_list_grad = [] for inputs_to_check_name in inputs_to_check: a = 
inputs_grad_dict[inputs_to_check_name].gradient() From 2ec943a7d29cace9ac5b36d3d6c9da3fedd99da5 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 24 Feb 2022 05:43:43 +0100 Subject: [PATCH 25/85] Added nearest interp v2 BF16 FWD kernel (#39490) * added nearest interp v2 bf16 * disabled bilinear interp nhwc test * added skipping UT for gpu * added NHWC support * removed unnecessary statements * minor change * CI fix * added appropriate changes to interpolate_v1 * fix after review * minor change * minor change * revert unwanted deletions * CI fix --- paddle/fluid/operators/interpolate_op.cc | 2 +- paddle/fluid/operators/interpolate_v2_op.cc | 2 +- .../operators/mkldnn/interpolate_mkldnn_op.cc | 34 ++++++++----------- .../test_nearest_interp_v2_mkldnn_op.py | 26 +++++++++----- 4 files changed, 34 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 8fac84176d97f..fda168c94e1e0 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -328,7 +328,7 @@ class InterpolateOp : public framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - auto interp_method = ctx.Attr("interp_method"); + const auto& interp_method = ctx.Attr("interp_method"); // TODO(danqing): support other interp_method if (this->CanMKLDNNBeUsed(ctx, data_type) && (interp_method == "nearest" || interp_method == "bilinear")) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index 7783303785998..4b5a18141d5aa 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -414,7 +414,7 @@ class InterpolateV2Op : public framework::OperatorWithKernel { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - auto interp_method = ctx.Attr("interp_method"); + const auto& interp_method = ctx.Attr("interp_method"); // TODO(danqing): support other interp_method if (this->CanMKLDNNBeUsed(ctx, data_type) && (interp_method == "nearest" || interp_method == "bilinear")) { diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 33ea36d24b8ae..04b90d2f1f380 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -53,17 +53,13 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { std::vector ComputeOutputShape( const framework::ExecutionContext& ctx) const { const auto* x = ctx.Input("X"); - auto in_dims = x->dims(); - const bool is_channel_last = false; // In mkldnn kernel, always use NCHW - - framework::DDim in_dhw_dims; - if (is_channel_last) { // NDHWC, NHWC, NWC - in_dhw_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1); - } else { // NCDHW, NCHW, NCW - in_dhw_dims = phi::slice_ddim(in_dims, 2, in_dims.size()); - } + const auto& in_dims = x->dims(); + + const framework::DDim in_dhw_dims = + phi::slice_ddim(in_dims, 2, in_dims.size()); std::vector out_dims; + out_dims.reserve(5); if (in_dhw_dims.size() == 1) { out_dims.push_back(ctx.Attr("out_w")); } else if (in_dhw_dims.size() == 2) { @@ -125,12 +121,8 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { "out_d, out_h, out_w of Op(interpolate) " "should be greater than 0.")); - out_dims.insert(out_dims.begin(), in_dims[0]); - if (is_channel_last) { - 
out_dims.push_back(in_dims[in_dims.size() - 1]); - } else { - out_dims.insert(out_dims.begin() + 1, in_dims[1]); - } + const std::vector nc_dims = {in_dims[0], in_dims[1]}; + out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end()); return out_dims; } @@ -143,12 +135,12 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { const auto* x = ctx.Input("X"); auto* z = ctx.Output("Out"); - auto interp_method = ctx.Attr("interp_method"); - dnnl::algorithm algo = (interp_method == "nearest") - ? dnnl::algorithm::resampling_nearest - : dnnl::algorithm::resampling_linear; + const auto interp_method = ctx.Attr("interp_method"); + const dnnl::algorithm algo = (interp_method == "nearest") + ? dnnl::algorithm::resampling_nearest + : dnnl::algorithm::resampling_linear; - auto out_dims_vec = ComputeOutputShape(ctx); + const auto out_dims_vec = ComputeOutputShape(ctx); framework::DDim dim_out = phi::make_ddim(out_dims_vec); z->Resize(dim_out); @@ -162,6 +154,7 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { const std::unordered_map args = { {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + resampling_prim->execute(astream, args); astream.wait(); @@ -184,6 +177,7 @@ REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, REGISTER_OP_KERNEL(nearest_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel, + ops::InterpolateMKLDNNKernel, ops::InterpolateMKLDNNKernel, ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(bilinear_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py index 24ebf40216f4b..d72a1d53d3aa5 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_nearest_interp_v2_mkldnn_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci @@ -59,6 +59,7 @@ def nearest_neighbor_interp_mkldnn_np(X, @skip_check_grad_ci(reason="Haven not implement interpolate grad kernel.") +@OpTestTool.skip_if_not_cpu_bf16() class TestNearestInterpV2MKLDNNOp(OpTest): def init_test_case(self): pass @@ -84,7 +85,7 @@ def setUp(self): self.init_test_case() self.init_data_type() - if self.dtype == np.float32: + if self.dtype == np.float32 or self.dtype == np.uint16: input_np = np.random.random(self.input_shape).astype(self.dtype) else: init_low, init_high = (-5, 5) if self.dtype == np.int8 else (0, 10) @@ -126,6 +127,9 @@ def setUp(self): if isinstance(self.scale, float): self.scale = [self.scale] + if self.dtype == np.uint16: + input_np = convert_float_to_uint16(input_np) + self.inputs = {'X': input_np} if self.out_size is not None: self.inputs['OutSize'] = self.out_size @@ -191,6 +195,10 @@ class TestFp32Case(parent): def init_data_type(self): self.dtype = np.float32 + class TestBf16Case(parent): + def init_data_type(self): + self.dtype = np.uint16 + class TestInt8Case(parent): def init_data_type(self): self.dtype = np.int8 @@ -199,12 +207,14 @@ class TestUint8Case(parent): def init_data_type(self): self.dtype = np.uint8 - TestFp32Case.__name__ = parent.__name__ - TestInt8Case.__name__ = 
parent.__name__ - TestUint8Case.__name__ = parent.__name__ - globals()[parent.__name__] = TestFp32Case - globals()[parent.__name__] = TestInt8Case - globals()[parent.__name__] = TestUint8Case + TestFp32Case.__name__ = "{0}_{1}".format(parent.__name__, "FP32") + TestBf16Case.__name__ = "{0}_{1}".format(parent.__name__, "BF16") + TestInt8Case.__name__ = "{0}_{1}".format(parent.__name__, "INT8") + TestUint8Case.__name__ = "{0}_{1}".format(parent.__name__, "UINT8") + globals()[TestFp32Case.__name__] = TestFp32Case + globals()[TestBf16Case.__name__] = TestBf16Case + globals()[TestInt8Case.__name__] = TestInt8Case + globals()[TestUint8Case.__name__] = TestUint8Case create_test_class(TestNearestInterpV2MKLDNNOp) From 23bbd912a0af9df3095c0659c6dd2e264c22979a Mon Sep 17 00:00:00 2001 From: zmxdream Date: Thu, 24 Feb 2022 13:16:10 +0800 Subject: [PATCH 26/85] config fleet optimize. test=develop (#39849) --- paddle/fluid/framework/ps_gpu_trainer.cc | 81 ++++++++++++------------ paddle/fluid/framework/trainer.h | 1 + 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 0705f658ff5fe..e0cf860e5bc7b 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -46,6 +46,48 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, dense_grad_names_[table_id][j] = table.dense_grad_name(j); } } + InitializeGPUServer(trainer_desc); + scale_datanorm_ = trainer_desc.scale_datanorm(); + int place_num = trainer_desc.worker_places_size(); + const std::vector readers = + dataset->GetReaders(); + dump_file_num_ = trainer_desc.dump_file_num(); + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); + std::vector dev_ids; + for (int i = 0; i < place_num; ++i) { + int num = trainer_desc.worker_places(i); + platform::CUDAPlace place = platform::CUDAPlace(num); + places_.push_back(place); + dev_ids.push_back(num); + } + for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); + i++) { + need_merge_var_names_.push_back( + trainer_desc.downpour_param().stat_var_names(i)); + } + VLOG(3) << "going to initialize pull dense worker"; + SetDebug(trainer_desc.debug()); + trainer_desc_ = trainer_desc; + workers_.resize(place_num); + for (int i = 0; i < place_num; ++i) { + workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( + trainer_desc.device_worker_name()); + workers_[i]->SetDeviceIndex(i); + workers_[i]->SetNeedDumpField(need_dump_field_); + workers_[i]->SetNeedDumpParam(need_dump_param_); + workers_[i]->SetDumpFieldVector(dump_fields_); + workers_[i]->SetDumpParamVector(dump_param_); + workers_[i]->InitRandomDumpConfig(trainer_desc); + workers_[i]->SetDataFeed(readers[i]); + workers_[i]->SetPlace(places_[i]); + workers_[i]->SetReaderPlace(places_[i]); + workers_[i]->Initialize(trainer_desc); + workers_[i]->SetWorkerNum(place_num); + } + return; +} + +void PSGPUTrainer::InitializeGPUServer(const TrainerDesc& trainer_desc) { // add for hbmps optimizer config auto fleet_desc_str = trainer_desc.fleet_desc(); google::protobuf::TextFormat::ParseFromString(fleet_desc_str, &_ps_param); @@ -203,45 +245,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, auto ps_gpu_wrapper = paddle::framework::PSGPUWrapper::GetInstance(); ps_gpu_wrapper->InitializeGPUServer(config); - - scale_datanorm_ = trainer_desc.scale_datanorm(); - int place_num = trainer_desc.worker_places_size(); - const std::vector readers = - dataset->GetReaders(); - 
dump_file_num_ = trainer_desc.dump_file_num(); - user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); - std::vector dev_ids; - for (int i = 0; i < place_num; ++i) { - int num = trainer_desc.worker_places(i); - platform::CUDAPlace place = platform::CUDAPlace(num); - places_.push_back(place); - dev_ids.push_back(num); - } - for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size(); - i++) { - need_merge_var_names_.push_back( - trainer_desc.downpour_param().stat_var_names(i)); - } - VLOG(3) << "going to initialize pull dense worker"; - SetDebug(trainer_desc.debug()); - trainer_desc_ = trainer_desc; - workers_.resize(place_num); - for (int i = 0; i < place_num; ++i) { - workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( - trainer_desc.device_worker_name()); - workers_[i]->SetDeviceIndex(i); - workers_[i]->SetNeedDumpField(need_dump_field_); - workers_[i]->SetNeedDumpParam(need_dump_param_); - workers_[i]->SetDumpFieldVector(dump_fields_); - workers_[i]->SetDumpParamVector(dump_param_); - workers_[i]->InitRandomDumpConfig(trainer_desc); - workers_[i]->SetDataFeed(readers[i]); - workers_[i]->SetPlace(places_[i]); - workers_[i]->SetReaderPlace(places_[i]); - workers_[i]->Initialize(trainer_desc); - workers_[i]->SetWorkerNum(place_num); - } - return; } std::string PSGPUTrainer::GetDumpPath(int tid) { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 85eef89ee27f6..8a11775702e57 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -271,6 +271,7 @@ class PSGPUTrainer : public TrainerBase { template void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor); + void InitializeGPUServer(const TrainerDesc& trainer_desc); protected: Dataset* dataset_; From bbe441fc74a203e464c0b73c47693a65818837ee Mon Sep 17 00:00:00 2001 From: Zhou Wei <1183042833@qq.com> Date: Thu, 24 Feb 2022 13:28:38 +0800 Subject: [PATCH 27/85] =?UTF-8?q?=E3=80=90Phi=E3=80=91Migrate=20poisson=20?= =?UTF-8?q?op=20into=20phi=20(#39814)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Migrate poisson op into phi * fix CI * fix comment --- paddle/fluid/operators/poisson_op.cc | 51 ++--------- paddle/fluid/operators/poisson_op.cu | 91 ------------------- paddle/fluid/operators/poisson_op.h | 41 --------- paddle/phi/infermeta/unary.h | 1 + paddle/phi/kernels/cpu/poisson_grad_kernel.cc | 19 ++++ paddle/phi/kernels/cpu/poisson_kernel.cc | 41 +++++++++ paddle/phi/kernels/gpu/poisson_grad_kernel.cu | 19 ++++ paddle/phi/kernels/gpu/poisson_kernel.cu | 77 ++++++++++++++++ .../kernels/impl/poisson_grad_kernel_impl.h | 29 ++++++ paddle/phi/kernels/poisson_grad_kernel.h | 25 +++++ paddle/phi/kernels/poisson_kernel.h | 24 +++++ paddle/phi/ops/compat/poisson_sig.cc | 26 ++++++ 12 files changed, 270 insertions(+), 174 deletions(-) delete mode 100644 paddle/fluid/operators/poisson_op.cu delete mode 100644 paddle/fluid/operators/poisson_op.h create mode 100644 paddle/phi/kernels/cpu/poisson_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/poisson_kernel.cc create mode 100644 paddle/phi/kernels/gpu/poisson_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/poisson_kernel.cu create mode 100644 paddle/phi/kernels/impl/poisson_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/poisson_grad_kernel.h create mode 100644 paddle/phi/kernels/poisson_kernel.h create mode 100644 paddle/phi/ops/compat/poisson_sig.cc diff --git a/paddle/fluid/operators/poisson_op.cc 
b/paddle/fluid/operators/poisson_op.cc index cc4b6e5e0756a..0cecbf0b9cb02 100644 --- a/paddle/fluid/operators/poisson_op.cc +++ b/paddle/fluid/operators/poisson_op.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include - -#include "paddle/fluid/operators/poisson_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,14 +25,6 @@ class PoissonOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "PoissonOp"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "PoissonOp"); - - auto dim = ctx->GetInputDim("X"); - ctx->SetOutputDim("Out", dim); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -61,29 +55,6 @@ class PoissonOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { } }; -template -class PoissonKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - auto *out = ctx.Output("Out"); - - const T *x_data = x->data(); - T *out_data = out->mutable_data(ctx.GetPlace()); - - int64_t size = x->numel(); - - auto gen = framework::DefaultCPUGenerator(); - auto engine = gen->GetCPUEngine(); - - for (int64_t i = 0; i < size; ++i) { - std::poisson_distribution<> dist(x_data[i]); - out_data[i] = static_cast(dist(*engine)); - } - } -}; - class PoissonGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -116,17 +87,13 @@ class PoissonGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(poisson, PoissonInferShapeFunctor, + PT_INFER_META(phi::UnchangedInferMeta)); + REGISTER_OPERATOR(poisson, ops::PoissonOp, ops::PoissonOpMaker, ops::PoissonOpInferVarType, ops::PoissonGradOpMaker, - ops::PoissonGradOpMaker); + ops::PoissonGradOpMaker, + PoissonInferShapeFunctor); REGISTER_OPERATOR(poisson_grad, ops::PoissonGradOp); - -REGISTER_OP_CPU_KERNEL(poisson, - ops::PoissonKernel, - ops::PoissonKernel); - -REGISTER_OP_CPU_KERNEL(poisson_grad, - ops::PoissonGradKernel, - ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.cu b/paddle/fluid/operators/poisson_op.cu deleted file mode 100644 index ef2f6d4665554..0000000000000 --- a/paddle/fluid/operators/poisson_op.cu +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef __NVCC__ -#include -#endif -#ifdef __HIPCC__ -#include -#endif -#include "paddle/fluid/operators/poisson_op.h" -#include "paddle/fluid/platform/for_range.h" - -namespace paddle { -namespace operators { - -template -struct PoissonCudaFunctor { - public: - PoissonCudaFunctor(const T* in, T* out, unsigned int seed, - unsigned int offset) - : in_(in), out_(out), seed_(seed), offset_(offset) {} - - __device__ void operator()(int64_t idx) { -#ifdef __NVCC__ - curandStatePhilox4_32_10_t state; - curand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(curand_poisson(&state, in_[idx])); -#elif __HIPCC__ - hiprandStatePhilox4_32_10_t state; - hiprand_init(seed_, idx, offset_, &state); - out_[idx] = static_cast(hiprand_poisson(&state, in_[idx])); -#endif - } - - private: - const T* in_; - T* out_; - const unsigned int seed_; - const unsigned int offset_; -}; - -template -class PoissonKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - const T* x_data = x->data(); - T* out_data = out->mutable_data(ctx.GetPlace()); - auto size = x->numel(); - int64_t device_id = ctx.GetPlace().GetDeviceId(); - - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - auto seed_offset = gen_cuda->IncrementOffset(20); - uint64_t seed = seed_offset.first; - uint64_t offset = seed_offset.second; - - auto& dev_ctx = ctx.template device_context(); - platform::ForRange for_range(dev_ctx, size); - - PoissonCudaFunctor functor(x_data, out_data, seed, offset); - for_range(functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL(poisson, - ops::PoissonKernel, - ops::PoissonKernel); - -REGISTER_OP_CUDA_KERNEL( - poisson_grad, ops::PoissonGradKernel, - ops::PoissonGradKernel); diff --git a/paddle/fluid/operators/poisson_op.h b/paddle/fluid/operators/poisson_op.h deleted file mode 100644 index 2bcb5244012c7..0000000000000 --- a/paddle/fluid/operators/poisson_op.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PoissonKernel; - -template -class PoissonGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - phi::funcs::SetConstant functor; - auto& dev_ctx = ctx.template device_context(); - functor(dev_ctx, dx, static_cast(0)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 7d15f497ead14..21cbe76bb13c0 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -103,4 +103,5 @@ void UnfoldInferMeta(const MetaTensor& x, const std::vector& dilations, MetaTensor* out, MetaConfig config = MetaConfig()); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/poisson_grad_kernel.cc b/paddle/phi/kernels/cpu/poisson_grad_kernel.cc new file mode 100644 index 0000000000000..4e274a7af9ff3 --- /dev/null +++ b/paddle/phi/kernels/cpu/poisson_grad_kernel.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/poisson_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + poisson_grad, CPU, ALL_LAYOUT, phi::PoissonGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/poisson_kernel.cc b/paddle/phi/kernels/cpu/poisson_kernel.cc new file mode 100644 index 0000000000000..6a3e32c2f0785 --- /dev/null +++ b/paddle/phi/kernels/cpu/poisson_kernel.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/poisson_kernel.h" + +namespace phi { + +template +void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + const T* x_data = x.data(); + T* out_data = ctx.template Alloc(out); + int64_t size = x.numel(); + + auto gen = ctx.GetGenerator(); + auto engine = gen->GetCPUEngine(); + + for (int64_t i = 0; i < size; ++i) { + std::poisson_distribution<> dist(x_data[i]); + out_data[i] = static_cast(dist(*engine)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + poisson, CPU, ALL_LAYOUT, phi::PoissonKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/poisson_grad_kernel.cu b/paddle/phi/kernels/gpu/poisson_grad_kernel.cu new file mode 100644 index 0000000000000..8c16bc51fffe5 --- /dev/null +++ b/paddle/phi/kernels/gpu/poisson_grad_kernel.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/poisson_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + poisson_grad, GPU, ALL_LAYOUT, phi::PoissonGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/poisson_kernel.cu b/paddle/phi/kernels/gpu/poisson_kernel.cu new file mode 100644 index 0000000000000..ae97f2fca68cb --- /dev/null +++ b/paddle/phi/kernels/gpu/poisson_kernel.cu @@ -0,0 +1,77 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/poisson_kernel.h" + +namespace phi { + +template +struct PoissonCudaFunctor { + public: + PoissonCudaFunctor(const T* in, + T* out, + unsigned int seed, + unsigned int offset) + : in_(in), out_(out), seed_(seed), offset_(offset) {} + + __device__ void operator()(int64_t idx) { +#ifdef __NVCC__ + curandStatePhilox4_32_10_t state; + curand_init(seed_, idx, offset_, &state); + out_[idx] = static_cast(curand_poisson(&state, in_[idx])); +#elif __HIPCC__ + hiprandStatePhilox4_32_10_t state; + hiprand_init(seed_, idx, offset_, &state); + out_[idx] = static_cast(hiprand_poisson(&state, in_[idx])); +#endif + } + + private: + const T* in_; + T* out_; + const unsigned int seed_; + const unsigned int offset_; +}; + +template +void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + const T* x_data = x.data(); + T* out_data = ctx.template Alloc(out); + auto size = x.numel(); + + auto gen_cuda = ctx.GetGenerator(); + auto seed_offset = gen_cuda->IncrementOffset(20); + uint64_t seed = seed_offset.first; + uint64_t offset = seed_offset.second; + + paddle::platform::ForRange for_range(ctx, size); + + PoissonCudaFunctor functor(x_data, out_data, seed, offset); + for_range(functor); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + poisson, GPU, ALL_LAYOUT, phi::PoissonKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h b/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h new file mode 100644 index 0000000000000..4e82cccac3422 --- /dev/null +++ b/paddle/phi/kernels/impl/poisson_grad_kernel_impl.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/poisson_grad_kernel.h" + +namespace phi { + +template +void PoissonGradKernel(const Context& ctx, DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + phi::funcs::SetConstant functor; + functor(ctx, x_grad, static_cast(0)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/poisson_grad_kernel.h b/paddle/phi/kernels/poisson_grad_kernel.h new file mode 100644 index 0000000000000..21720474f4a12 --- /dev/null +++ b/paddle/phi/kernels/poisson_grad_kernel.h @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace phi {
+
+template
+void PoissonGradKernel(const Context& ctx, DenseTensor* x_grad);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/poisson_kernel.h b/paddle/phi/kernels/poisson_kernel.h
new file mode 100644
index 0000000000000..f67c9c46311d1
--- /dev/null
+++ b/paddle/phi/kernels/poisson_kernel.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template
+void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out);
+
+} // namespace phi
diff --git a/paddle/phi/ops/compat/poisson_sig.cc b/paddle/phi/ops/compat/poisson_sig.cc
new file mode 100644
index 0000000000000..cb6ae28804669
--- /dev/null
+++ b/paddle/phi/ops/compat/poisson_sig.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PoissonGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("poisson_grad", {}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(poisson_grad, phi::PoissonGradOpArgumentMapping); From eb4ad5094af931766a69a36f48b6db40211e9bee Mon Sep 17 00:00:00 2001 From: xiaoting <31891223+tink2123@users.noreply.github.com> Date: Thu, 24 Feb 2022 14:12:53 +0800 Subject: [PATCH 28/85] [doc]Fix maxunpool2d example (#39862) * fix maxunpool2d example, test=document_fix * fix maxunpool2d example, test=document_fix --- python/paddle/nn/layer/pooling.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 68808c6354afb..c664c6e318c46 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -1257,9 +1257,8 @@ class MaxUnPool2D(Layer): import paddle import paddle.nn.functional as F - import numpy as np - data = paddle.rand(shape=[1,1,7,7]) + data = paddle.rand(shape=[1,1,6,6]) pool_out, indices = F.max_pool2d(data, kernel_size=2, stride=2, padding=0, return_mask=True) # pool_out shape: [1, 1, 3, 3], indices shape: [1, 1, 3, 3] Unpool2D = paddle.nn.MaxUnPool2D(kernel_size=2, padding=0) From 6fc5d88a378892ad0936626222983d8958aea24c Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Thu, 24 Feb 2022 14:15:20 +0800 Subject: [PATCH 29/85] [phi] move bce_loss to phi (#39868) * move bce_loss to phi * refine PADDLE_ENFORCE * revert PADDLE_ENFORCE * fix ci --- paddle/fluid/operators/bce_loss_op.cc | 52 ++------- paddle/fluid/operators/bce_loss_op.cu | 109 ------------------ paddle/fluid/operators/bce_loss_op.h | 85 -------------- paddle/fluid/operators/bce_loss_op_npu.cc | 2 +- paddle/phi/infermeta/binary.cc | 38 ++++++ paddle/phi/infermeta/binary.h | 4 + paddle/phi/kernels/bce_loss_grad_kernel.h | 28 +++++ paddle/phi/kernels/bce_loss_kernel.h | 27 +++++ .../phi/kernels/cpu/bce_loss_grad_kernel.cc | 47 ++++++++ paddle/phi/kernels/cpu/bce_loss_kernel.cc | 59 ++++++++++ .../phi/kernels/gpu/bce_loss_grad_kernel.cu | 59 ++++++++++ paddle/phi/kernels/gpu/bce_loss_kernel.cu | 64 ++++++++++ paddle/phi/ops/compat/bce_loss_sig.cc | 29 +++++ 13 files changed, 364 insertions(+), 239 deletions(-) delete mode 100644 paddle/fluid/operators/bce_loss_op.cu delete mode 100644 paddle/fluid/operators/bce_loss_op.h create mode 100644 paddle/phi/kernels/bce_loss_grad_kernel.h create mode 100644 paddle/phi/kernels/bce_loss_kernel.h create mode 100644 paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/bce_loss_kernel.cc create mode 100644 paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/bce_loss_kernel.cu create mode 100644 paddle/phi/ops/compat/bce_loss_sig.cc diff --git a/paddle/fluid/operators/bce_loss_op.cc b/paddle/fluid/operators/bce_loss_op.cc index 1c390923d0b0a..55bb57466c7b5 100644 --- a/paddle/fluid/operators/bce_loss_op.cc +++ b/paddle/fluid/operators/bce_loss_op.cc @@ -12,11 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/bce_loss_op.h" #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -26,41 +29,6 @@ class BCELossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "BCELoss"); - OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", "BCELoss"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss"); - - auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Label"); - - int rank = x_dims.size(); - PADDLE_ENFORCE_EQ(rank, labels_dims.size(), - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same rank." - "But received: the rank of Input(X) is [%d], " - "the rank of Input(Label) is [%d].", - rank, labels_dims.size())); - - bool check = true; - if ((!ctx->IsRuntime()) && - (phi::product(x_dims) <= 0 || phi::product(labels_dims) <= 0)) { - check = false; - } - - if (check) { - PADDLE_ENFORCE_EQ(x_dims, labels_dims, - platform::errors::InvalidArgument( - "Input(X) and Input(Label) shall have the same " - "shape. But received: the shape of Input(X) is " - "[%s], the shape of Input(Label) is [%s].", - x_dims, labels_dims)); - } - - ctx->ShareDim("X", "Out"); - ctx->ShareLoD("X", "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -170,16 +138,12 @@ DECLARE_INPLACE_OP_INFERER(BCELossGradInplaceInferer, } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(bce_loss, BCELossInferShapeFunctor, + PT_INFER_META(phi::BCELossInferMeta)); + REGISTER_OPERATOR(bce_loss, ops::BCELossOp, ops::BCELossOpMaker, ops::BCELossGradOpMaker, ops::BCELossGradOpMaker, - ops::BCELossInplaceInferer); + ops::BCELossInplaceInferer, BCELossInferShapeFunctor); REGISTER_OPERATOR(bce_loss_grad, ops::BCELossGradOp, ops::BCELossGradInplaceInferer); -REGISTER_OP_CPU_KERNEL( - bce_loss, ops::BCELossOpKernel, - ops::BCELossOpKernel); -REGISTER_OP_CPU_KERNEL( - bce_loss_grad, - ops::BCELossGradOpKernel, - ops::BCELossGradOpKernel); diff --git a/paddle/fluid/operators/bce_loss_op.cu b/paddle/fluid/operators/bce_loss_op.cu deleted file mode 100644 index f71fbbdc6b19e..0000000000000 --- a/paddle/fluid/operators/bce_loss_op.cu +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include "paddle/fluid/operators/bce_loss_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/core/hostdevice.h" - -namespace paddle { -namespace operators { -template -struct BCELossFunctor { - T one; - T neg_100; - - HOSTDEVICE inline BCELossFunctor() { - one = static_cast(1.0f); - neg_100 = static_cast(-100.); - } - - HOSTDEVICE inline T operator()(const T x, const T label) const { - PADDLE_ENFORCE( - (x >= static_cast(0)) && (x <= one), - "Input is expected to be within the interval [0, 1], but recieved %f.", - x); - T term1 = max(real_log(x), neg_100); - T term2 = max(real_log(one - x), neg_100); - return (((label - one) * term2) - (label * term1)); - } -}; - -template -struct BCELossGradFunctor { - T one; - T eps; - - HOSTDEVICE inline BCELossGradFunctor() { - one = static_cast(1.0f); - eps = static_cast(1e-12); - } - - HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const { - T term1 = max((one - x) * x, eps); - return (dout * (x - label) / term1); - } -}; - -using Tensor = framework::Tensor; - -template -class BCELossCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - std::vector ins = {x, labels}; - std::vector outs = {out}; - auto& dev_ctx = ctx.template device_context(); - auto functor = BCELossFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -template -class BCELossGradCUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - std::vector ins = {x, labels, dout}; - std::vector outs = {dx}; - auto& dev_ctx = ctx.template device_context(); - auto functor = BCELossGradFunctor(); - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bce_loss, - ops::BCELossCUDAKernel, - ops::BCELossCUDAKernel); -REGISTER_OP_CUDA_KERNEL( - bce_loss_grad, - ops::BCELossGradCUDAKernel, - ops::BCELossGradCUDAKernel); diff --git a/paddle/fluid/operators/bce_loss_op.h b/paddle/fluid/operators/bce_loss_op.h deleted file mode 100644 index dd87b69efe286..0000000000000 --- a/paddle/fluid/operators/bce_loss_op.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include // for max -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class BCELossOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* out = ctx.Output("Out"); - - auto x_data = x->data(); - auto label_data = labels->data(); - auto out_data = out->mutable_data(ctx.GetPlace()); - auto x_numel = x->numel(); - - // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - - // x) - label * ln(x) - for (int64_t i = 0; i < x_numel; ++i) { - PADDLE_ENFORCE_GE( - x_data[i], static_cast(0), - platform::errors::InvalidArgument( - "Illegal input, input must be greater than or equal to 0")); - PADDLE_ENFORCE_LE( - x_data[i], static_cast(1), - platform::errors::InvalidArgument( - "Illegal input, input must be less than or equal to 1")); - out_data[i] = - (label_data[i] - static_cast(1)) * - std::max(real_log(static_cast(1) - x_data[i]), (T)(-100)) - - label_data[i] * std::max(real_log(x_data[i]), (T)(-100)); - } - } -}; - -template -class BCELossGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* labels = ctx.Input("Label"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - - auto dx_data = dx->mutable_data(ctx.GetPlace()); - auto dout_data = dout->data(); - auto x_data = x->data(); - auto label_data = labels->data(); - - int x_numel = x->numel(); - - // dx = dout * ((x - label)/(x - x^2)) - for (int i = 0; i < x_numel; ++i) { - dx_data[i] = - dout_data[i] * ((x_data[i] - label_data[i]) / - std::max((static_cast(1) - x_data[i]) * x_data[i], - static_cast(1e-12))); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/bce_loss_op_npu.cc b/paddle/fluid/operators/bce_loss_op_npu.cc index 46e8a36d2eef7..c3cee6a7b0d5b 100644 --- a/paddle/fluid/operators/bce_loss_op_npu.cc +++ b/paddle/fluid/operators/bce_loss_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bce_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index e94926a9c1403..ab1fe5433f302 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -230,4 +230,42 @@ void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { out->set_dims(in_dims); } +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config) { + auto input_dims = input.dims(); + auto label_dims = label.dims(); + + int rank = input_dims.size(); + PADDLE_ENFORCE_EQ(rank, + label_dims.size(), + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same rank." 
+ "But received: the rank of Input(X) is [%d], " + "the rank of Input(Label) is [%d].", + rank, + label_dims.size())); + + bool check = true; + if ((!config.is_runtime) && + (phi::product(input_dims) <= 0 || phi::product(label_dims) <= 0)) { + check = false; + } + + if (check) { + PADDLE_ENFORCE_EQ(input_dims, + label_dims, + phi::errors::InvalidArgument( + "Input(X) and Input(Label) shall have the same " + "shape. But received: the shape of Input(X) is " + "[%s], the shape of Input(Label) is [%s].", + input_dims, + label_dims)); + } + + out->set_dims(input_dims); + out->share_lod(input); +} + } // namespace phi diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index f23382be89b6a..effa18c5677f6 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -54,4 +54,8 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaConfig config = MetaConfig()); void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); +void BCELossInferMeta(const MetaTensor& input, + const MetaTensor& label, + MetaTensor* out, + MetaConfig config = MetaConfig()); } // namespace phi diff --git a/paddle/phi/kernels/bce_loss_grad_kernel.h b/paddle/phi/kernels/bce_loss_grad_kernel.h new file mode 100644 index 0000000000000..14bf52196ac40 --- /dev/null +++ b/paddle/phi/kernels/bce_loss_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BCELossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + DenseTensor* input_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/bce_loss_kernel.h b/paddle/phi/kernels/bce_loss_kernel.h new file mode 100644 index 0000000000000..6459ea911666e --- /dev/null +++ b/paddle/phi/kernels/bce_loss_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template
+void BCELossKernel(const Context& dev_ctx,
+                   const DenseTensor& input,
+                   const DenseTensor& label,
+                   DenseTensor* out);
+
+} // namespace phi
diff --git a/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc
new file mode 100644
index 0000000000000..6859451e8be32
--- /dev/null
+++ b/paddle/phi/kernels/cpu/bce_loss_grad_kernel.cc
@@ -0,0 +1,47 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/bce_loss_grad_kernel.h"
+
+#include // for max
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template
+void BCELossGradKernel(const Context& dev_ctx,
+                       const DenseTensor& input,
+                       const DenseTensor& label,
+                       const DenseTensor& out_grad,
+                       DenseTensor* input_grad) {
+  auto dx_data = dev_ctx.template Alloc(input_grad);
+  auto dout_data = out_grad.data();
+  auto x_data = input.data();
+  auto label_data = label.data();
+
+  int x_numel = input.numel();
+
+  // dx = dout * ((x - label)/(x - x^2))
+  for (int i = 0; i < x_numel; ++i) {
+    dx_data[i] =
+        dout_data[i] * ((x_data[i] - label_data[i]) /
+                        std::max((static_cast(1) - x_data[i]) * x_data[i],
+                                 static_cast(1e-12)));
+  }
+}
+} // namespace phi
+
+PD_REGISTER_KERNEL(
+    bce_loss_grad, CPU, ALL_LAYOUT, phi::BCELossGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/bce_loss_kernel.cc b/paddle/phi/kernels/cpu/bce_loss_kernel.cc
new file mode 100644
index 0000000000000..76b9793651484
--- /dev/null
+++ b/paddle/phi/kernels/cpu/bce_loss_kernel.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/kernels/bce_loss_kernel.h" + +#include // for max +#include "paddle/fluid/operators/math.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void BCELossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + DenseTensor* out) { + auto x_data = input.data(); + auto label_data = label.data(); + auto out_data = dev_ctx.template Alloc(out); + auto x_numel = input.numel(); + + // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - + // x) - label * ln(x) + for (int64_t i = 0; i < x_numel; ++i) { + PADDLE_ENFORCE_GE( + x_data[i], + static_cast(0), + phi::errors::InvalidArgument( + "Illegal input, input must be greater than or equal to 0")); + PADDLE_ENFORCE_LE( + x_data[i], + static_cast(1), + phi::errors::InvalidArgument( + "Illegal input, input must be less than or equal to 1")); + out_data[i] = + (label_data[i] - static_cast(1)) * + std::max(paddle::operators::real_log(static_cast(1) - x_data[i]), + (T)(-100)) - + label_data[i] * + std::max(paddle::operators::real_log(x_data[i]), (T)(-100)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + bce_loss, CPU, ALL_LAYOUT, phi::BCELossKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu new file mode 100644 index 0000000000000..94eabac4d1306 --- /dev/null +++ b/paddle/phi/kernels/gpu/bce_loss_grad_kernel.cu @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bce_loss_grad_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" + +namespace phi { + +template +struct BCELossGradFunctor { + T one; + T eps; + + HOSTDEVICE inline BCELossGradFunctor() { + one = static_cast(1.0f); + eps = static_cast(1e-12); + } + + HOSTDEVICE inline T operator()(const T x, const T label, const T dout) const { + T term1 = max((one - x) * x, eps); + return (dout * (x - label) / term1); + } +}; + +template +void BCELossGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + const DenseTensor& out_grad, + DenseTensor* input_grad) { + dev_ctx.template Alloc(input_grad); + std::vector ins = {&input, &label, &out_grad}; + std::vector outs = {input_grad}; + auto functor = BCELossGradFunctor(); + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + bce_loss_grad, GPU, ALL_LAYOUT, phi::BCELossGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/bce_loss_kernel.cu b/paddle/phi/kernels/gpu/bce_loss_kernel.cu new file mode 100644 index 0000000000000..adbcd3b2b6207 --- /dev/null +++ b/paddle/phi/kernels/gpu/bce_loss_kernel.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bce_loss_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/primitive/functor_primitives.h" + +namespace phi { + +template +struct BCELossFunctor { + T one; + T neg_100; + + HOSTDEVICE inline BCELossFunctor() { + one = static_cast(1.0f); + neg_100 = static_cast(-100.); + } + + HOSTDEVICE inline T operator()(const T x, const T label) const { + PADDLE_ENFORCE( + (x >= static_cast(0)) && (x <= one), + "Input is expected to be within the interval [0, 1], but recieved %f.", + x); + T term1 = max(phi::kps::details::Log(x), neg_100); + T term2 = max(phi::kps::details::Log(one - x), neg_100); + return (((label - one) * term2) - (label * term1)); + } +}; + +template +void BCELossKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& label, + DenseTensor* out) { + dev_ctx.template Alloc(out); + std::vector ins = {&input, &label}; + std::vector outs = {out}; + auto functor = BCELossFunctor(); + phi::funcs::ElementwiseKernel(dev_ctx, ins, &outs, functor); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + bce_loss, GPU, ALL_LAYOUT, phi::BCELossKernel, float, double) {} diff --git a/paddle/phi/ops/compat/bce_loss_sig.cc b/paddle/phi/ops/compat/bce_loss_sig.cc new file mode 100644 index 0000000000000..17f76067d22db --- /dev/null +++ b/paddle/phi/ops/compat/bce_loss_sig.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BCELossGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("bce_loss_grad", + {"X", "Label", GradVarName("Out")}, + {}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(bce_loss_grad, phi::BCELossGradOpArgumentMapping); From 6c358a7c22fe9acd35cae13f7debc8200350d0ee Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Thu, 24 Feb 2022 15:34:31 +0800 Subject: [PATCH 30/85] [Phi]Move cross OP to phi (#39829) * move cross forward OP * move cross grad op to phi * move infershape * refine infershape * rename ctx * set dtype and layout in InferMeta * refine code --- paddle/fluid/operators/cross_op.cc | 69 +----- paddle/fluid/operators/cross_op.cu | 28 --- paddle/fluid/operators/cross_op.h | 222 ------------------ paddle/phi/infermeta/binary.cc | 45 ++++ paddle/phi/infermeta/binary.h | 5 + paddle/phi/kernels/cpu/cross_grad_kernel.cc | 28 +++ paddle/phi/kernels/cpu/cross_kernel.cc | 22 ++ paddle/phi/kernels/cross_grad_kernel.h | 30 +++ paddle/phi/kernels/cross_kernel.h | 28 +++ paddle/phi/kernels/funcs/common_shape.h | 12 + paddle/phi/kernels/gpu/cross_grad_kernel.cu | 28 +++ paddle/phi/kernels/gpu/cross_kernel.cu | 22 ++ .../phi/kernels/impl/cross_grad_kernel_impl.h | 113 +++++++++ paddle/phi/kernels/impl/cross_kernel_impl.h | 116 +++++++++ paddle/phi/ops/compat/cross_sig.cc | 33 +++ 15 files changed, 491 insertions(+), 310 deletions(-) delete mode 100644 paddle/fluid/operators/cross_op.cu delete mode 100644 paddle/fluid/operators/cross_op.h create mode 100644 paddle/phi/kernels/cpu/cross_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/cross_kernel.cc create mode 100644 paddle/phi/kernels/cross_grad_kernel.h create mode 100644 paddle/phi/kernels/cross_kernel.h create mode 100644 paddle/phi/kernels/gpu/cross_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/cross_kernel.cu create mode 100644 paddle/phi/kernels/impl/cross_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/cross_kernel_impl.h create mode 100644 paddle/phi/ops/compat/cross_sig.cc diff --git a/paddle/fluid/operators/cross_op.cc b/paddle/fluid/operators/cross_op.cc index e6b30ba42fc26..fe00ee06603f0 100644 --- a/paddle/fluid/operators/cross_op.cc +++ b/paddle/fluid/operators/cross_op.cc @@ -12,67 +12,23 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/cross_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { using framework::Tensor; using framework::DDim; +const int kDefaultDim = framework::DDim::kMaxRank; class CrossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of CrossOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Y"), true, - platform::errors::InvalidArgument( - "Input(Index) of CrossOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument( - "Output(Out) of CrossOp should not be null.")); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - auto dim = ctx->Attrs().Get("dim"); - - bool dims_match = CheckDims(x_dim, y_dim); - PADDLE_ENFORCE_EQ(dims_match, true, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - x_dim, y_dim)); - - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < x_dim.size() && dim >= (0 - x_dim.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - x_dim.size(), x_dim.size() - 1, dim)); - if (dim < 0) { - dim += x_dim.size(); - } - PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims()[dim] should be equal to 3." - "But received Input(X/Y).dims()[dim] = %d.", - x_dim[dim])); - } - - ctx->SetOutputDim("Out", x_dim); - auto type = ctx->GetInputsVarType("X")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -153,17 +109,10 @@ class CrossGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(cross, CrossInferShapeFunctor, + PT_INFER_META(phi::CrossInferMeta)); REGISTER_OPERATOR(cross, ops::CrossOp, ops::CrossOpMaker, ops::CrossGradMaker, - ops::CrossGradMaker); + ops::CrossGradMaker, + CrossInferShapeFunctor); REGISTER_OPERATOR(cross_grad, ops::CrossGradOp); -REGISTER_OP_CPU_KERNEL( - cross, ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel); -REGISTER_OP_CPU_KERNEL( - cross_grad, ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel); diff --git a/paddle/fluid/operators/cross_op.cu b/paddle/fluid/operators/cross_op.cu deleted file mode 100644 index 78bbb3ea56454..0000000000000 --- a/paddle/fluid/operators/cross_op.cu +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/operators/cross_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - cross, ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel, - ops::CrossKernel); -REGISTER_OP_CUDA_KERNEL( - cross_grad, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel, - ops::CrossGradKernel); diff --git a/paddle/fluid/operators/cross_op.h b/paddle/fluid/operators/cross_op.h deleted file mode 100644 index b1c5eb62fdce5..0000000000000 --- a/paddle/fluid/operators/cross_op.h +++ /dev/null @@ -1,222 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; -const int kDefaultDim = framework::DDim::kMaxRank; - -inline bool CheckDims(const DDim& dims_x, const DDim& dims_y) { - if (dims_x.size() != dims_y.size()) { - return false; - } - for (int i = 0; i < dims_x.size(); i++) { - if (dims_x[i] != dims_y[i]) { - return false; - } - } - return true; -} - -template -class CrossKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_x_var = context.InputVar("X"); - auto* input_y_var = context.InputVar("Y"); - auto* output_var = context.OutputVar("Out"); - - auto& input_x = input_x_var->Get(); - auto& input_y = input_y_var->Get(); - auto* output = output_var->GetMutable(); - int dim = context.Attr("dim"); - - auto input_x_dims = input_x.dims(); - auto input_y_dims = input_y.dims(); - bool dims_match = CheckDims(input_x_dims, input_y_dims); - PADDLE_ENFORCE_EQ(dims_match, true, - platform::errors::InvalidArgument( - "The 'shape' of Input(X) should be equal to " - "the 'shape' of Input(Y). But received " - "Input(X).dimensions = [%s], " - "Input(Y).dimensions = [%s]", - input_x_dims, input_x_dims)); - - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_x_dims.size(), input_x_dims.size() - 1, dim)); - if (dim < 0) { - dim += input_x_dims.size(); - } - - PADDLE_ENFORCE_EQ( - input_x_dims[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims[dim] must be equal to 3. 
But received: " - "Input(X/Y).dims[dim] = [%d].", - input_x_dims[dim])); - } else { - for (auto i = 0; i < input_x_dims.size(); i++) { - if (input_x_dims[i] == 3) { - dim = i; - break; - } - } - PADDLE_ENFORCE_EQ(dim == kDefaultDim, false, - platform::errors::InvalidArgument( - "There must be at least one dimension 'd' so that " - "Input(X/Y).dims()[d] is equal to 3. " - "But received: Input(X/Y).dims() == [%s].", - input_x_dims)); - } - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_x_dims[i]; - } - auto slice_size = 1; - for (auto i = dim + 1; i < input_x_dims.size(); i++) { - slice_size *= input_x_dims[i]; - } - - std::vector input_x_vec, input_y_vec; - framework::TensorToVector(input_x, context.device_context(), &input_x_vec); - framework::TensorToVector(input_y, context.device_context(), &input_y_vec); - std::vector out_vec(output->numel()); - - output->mutable_data(context.GetPlace()); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < 3; j++) { - auto dst_pos = (3 * i + j) * slice_size; - auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; - auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; - - for (auto k = 0; k < slice_size; k++) { - out_vec[dst_pos + k] = - input_x_vec[in_pos1 + k] * input_y_vec[in_pos2 + k] - - input_x_vec[in_pos2 + k] * input_y_vec[in_pos1 + k]; - } - } - } - framework::TensorFromVector(out_vec, context.device_context(), output); - output->Resize(input_x_dims); - } -}; - -template -class CrossGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* input_x_var = context.InputVar("X"); - auto* input_y_var = context.InputVar("Y"); - auto* input_out_grad_var = context.InputVar(framework::GradVarName("Out")); - auto* output_x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto* output_y_grad_var = context.OutputVar(framework::GradVarName("Y")); - - auto& input_x = input_x_var->Get(); - auto& input_y = input_y_var->Get(); - auto& input_out_grad = input_out_grad_var->Get(); - auto* output_x_grad = output_x_grad_var->GetMutable(); - auto* output_y_grad = output_y_grad_var->GetMutable(); - - int dim = context.Attr("dim"); - auto input_x_dims = input_x.dims(); - if (dim != kDefaultDim) { - PADDLE_ENFORCE_EQ( - dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), true, - platform::errors::OutOfRange( - "Attr(dim) is out of range, It's expected " - "to be in range of [-%d, %d]. But received Attr(dim) = %d.", - input_x_dims.size(), input_x_dims.size() - 1, dim)); - if (dim < 0) { - dim += input_x_dims.size(); - } - - PADDLE_ENFORCE_EQ( - input_x_dims[dim] == 3, true, - platform::errors::InvalidArgument( - "Input(X/Y).dims[dim] must be equal to 3. But received: " - "Input(X/Y).dims[dim] = [%d].", - input_x_dims[dim])); - } else { - for (auto i = 0; i < input_x_dims.size(); i++) { - if (input_x_dims[i] == 3) { - dim = i; - break; - } - } - PADDLE_ENFORCE_EQ(dim == kDefaultDim, false, - platform::errors::InvalidArgument( - "There must be at least one dimension 'd' " - "so that Input(X/Y).dims()[d] is equal to 3. 
" - "But received: Input(X/Y).dims() == [%s].", - input_x_dims)); - } - auto outer_loops = 1; - for (auto i = 0; i < dim; i++) { - outer_loops *= input_x_dims[i]; - } - auto slice_size = 1; - for (auto i = dim + 1; i < input_x_dims.size(); i++) { - slice_size *= input_x_dims[i]; - } - - std::vector input_x_vec, input_y_vec, input_dout_vec; - framework::TensorToVector(input_x, context.device_context(), &input_x_vec); - framework::TensorToVector(input_y, context.device_context(), &input_y_vec); - framework::TensorToVector(input_out_grad, context.device_context(), - &input_dout_vec); - std::vector out_dx_vec(output_x_grad->numel()); - std::vector out_dy_vec(output_y_grad->numel()); - - output_x_grad->mutable_data(context.GetPlace()); - output_y_grad->mutable_data(context.GetPlace()); - - for (auto i = 0; i < outer_loops; i++) { - for (auto j = 0; j < 3; j++) { - auto dst_pos = (3 * i + j) * slice_size; - auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; - auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; - for (auto k = 0; k < slice_size; k++) { - out_dx_vec[dst_pos + k] = - input_dout_vec[in_pos2 + k] * input_y_vec[in_pos1 + k] - - input_dout_vec[in_pos1 + k] * input_y_vec[in_pos2 + k]; - out_dy_vec[dst_pos + k] = - input_dout_vec[in_pos1 + k] * input_x_vec[in_pos2 + k] - - input_dout_vec[in_pos2 + k] * input_x_vec[in_pos1 + k]; - } - } - } - framework::TensorFromVector(out_dx_vec, context.device_context(), - output_x_grad); - framework::TensorFromVector(out_dy_vec, context.device_context(), - output_y_grad); - output_x_grad->Resize(input_x_dims); - output_y_grad->Resize(input_x_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index ab1fe5433f302..58cd43998b8a5 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -225,6 +225,51 @@ void HuberLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out) { + auto x_dim = x.dims(); + auto y_dim = y.dims(); + auto dim = axis; + + bool dims_match = phi::funcs::CheckDims(x_dim, y_dim); + PADDLE_ENFORCE_EQ( + dims_match, + true, + phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " + "the 'shape' of Input(Y). But received " + "Input(X).dimensions = [%s], " + "Input(Y).dimensions = [%s]", + x_dim, + y_dim)); + + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < x_dim.size() && dim >= (0 - x_dim.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + x_dim.size(), + x_dim.size() - 1, + dim)); + if (dim < 0) { + dim += x_dim.size(); + } + PADDLE_ENFORCE_EQ(x_dim[dim] == 3 && y_dim[dim] == 3, + true, + phi::errors::InvalidArgument( + "Input(X/Y).dims()[dim] should be equal to 3." 
+ "But received Input(X/Y).dims()[dim] = %d.", + x_dim[dim])); + } + out->set_dims(x_dim); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); +} + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { auto in_dims = x.dims(); out->set_dims(in_dims); diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index effa18c5677f6..02750482dccaa 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); +void CrossInferMeta(const MetaTensor& x, + const MetaTensor& y, + int axis, + MetaTensor* out); + void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out); void BCELossInferMeta(const MetaTensor& input, const MetaTensor& label, diff --git a/paddle/phi/kernels/cpu/cross_grad_kernel.cc b/paddle/phi/kernels/cpu/cross_grad_kernel.cc new file mode 100644 index 0000000000000..390420008e6ea --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_grad_kernel.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cross_grad_kernel.h" +#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(cross_grad, + CPU, + ALL_LAYOUT, + phi::CrossGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/cross_kernel.cc b/paddle/phi/kernels/cpu/cross_kernel.cc new file mode 100644 index 0000000000000..a63f33174eacd --- /dev/null +++ b/paddle/phi/kernels/cpu/cross_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cross_kernel.h" +#include "paddle/phi/kernels/impl/cross_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + cross, CPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/cross_grad_kernel.h b/paddle/phi/kernels/cross_grad_kernel.h new file mode 100644 index 0000000000000..9ea0804a94b6b --- /dev/null +++ b/paddle/phi/kernels/cross_grad_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/cross_kernel.h b/paddle/phi/kernels/cross_kernel.h new file mode 100644 index 0000000000000..567889e078345 --- /dev/null +++ b/paddle/phi/kernels/cross_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/common_shape.h b/paddle/phi/kernels/funcs/common_shape.h index 8bd9867f39edd..d5289dcc22cbc 100644 --- a/paddle/phi/kernels/funcs/common_shape.h +++ b/paddle/phi/kernels/funcs/common_shape.h @@ -128,5 +128,17 @@ static void GetBroadcastDims(const DDim &in_dims, } } +inline bool CheckDims(const DDim &dims_x, const DDim &dims_y) { + if (dims_x.size() != dims_y.size()) { + return false; + } + for (int i = 0; i < dims_x.size(); i++) { + if (dims_x[i] != dims_y[i]) { + return false; + } + } + return true; +} + } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/cross_grad_kernel.cu b/paddle/phi/kernels/gpu/cross_grad_kernel.cu new file mode 100644 index 0000000000000..1bb0d42dad81a --- /dev/null +++ b/paddle/phi/kernels/gpu/cross_grad_kernel.cu @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
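Side note on the dim/axis handling repeated in CrossInferMeta and the cross kernels above: an explicit dim may be negative (counted from the end) and must index an extent of 3; when dim is left at its sentinel value, the first dimension of extent 3 is used. A minimal standalone sketch of that resolution step follows (plain C++, with std::vector<int64_t> standing in for phi::DDim and a -1 return standing in for the OutOfRange/InvalidArgument error paths; this is an illustration, not code from the patch):

#include <cstdint>
#include <vector>

// Resolve the 'dim' attribute of the cross op against a shape, mirroring the
// checks above: negative dims wrap around, the chosen dimension must have
// extent 3, and an unset dim falls back to the first axis whose extent is 3.
// Returns -1 where the real code raises an error.
int ResolveCrossDim(const std::vector<int64_t>& shape, int dim,
                    int sentinel /* e.g. DDim::kMaxRank */) {
  const int rank = static_cast<int>(shape.size());
  if (dim != sentinel) {
    if (dim < -rank || dim >= rank) return -1;  // out of range
    if (dim < 0) dim += rank;                   // wrap negative index
    return shape[dim] == 3 ? dim : -1;          // chosen axis must be 3 wide
  }
  for (int i = 0; i < rank; ++i) {
    if (shape[i] == 3) return i;  // first extent-3 axis
  }
  return -1;  // no suitable axis found
}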
+ +#include "paddle/phi/kernels/cross_grad_kernel.h" +#include "paddle/phi/kernels/impl/cross_grad_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(cross_grad, + GPU, + ALL_LAYOUT, + phi::CrossGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/cross_kernel.cu b/paddle/phi/kernels/gpu/cross_kernel.cu new file mode 100644 index 0000000000000..aa944f8291674 --- /dev/null +++ b/paddle/phi/kernels/gpu/cross_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cross_kernel.h" +#include "paddle/phi/kernels/impl/cross_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + cross, GPU, ALL_LAYOUT, phi::CrossKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/impl/cross_grad_kernel_impl.h b/paddle/phi/kernels/impl/cross_grad_kernel_impl.h new file mode 100644 index 0000000000000..99a79dc15c049 --- /dev/null +++ b/paddle/phi/kernels/impl/cross_grad_kernel_impl.h @@ -0,0 +1,113 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CrossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto& input_x = x; + auto& input_y = y; + auto& input_out_grad = out_grad; + auto* output_x_grad = x_grad; + auto* output_y_grad = y_grad; + int dim = axis; + auto input_x_dims = input_x.dims(); + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), + true, + errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + input_x_dims.size(), + input_x_dims.size() - 1, + dim)); + if (dim < 0) { + dim += input_x_dims.size(); + } + + PADDLE_ENFORCE_EQ( + input_x_dims[dim] == 3, + true, + errors::InvalidArgument( + "Input(X/Y).dims[dim] must be equal to 3. 
But received: " + "Input(X/Y).dims[dim] = [%d].", + input_x_dims[dim])); + } else { + for (auto i = 0; i < input_x_dims.size(); i++) { + if (input_x_dims[i] == 3) { + dim = i; + break; + } + } + PADDLE_ENFORCE_EQ( + dim == DDim::kMaxRank, + false, + errors::InvalidArgument("There must be at least one dimension 'd' " + "so that Input(X/Y).dims()[d] is equal to 3. " + "But received: Input(X/Y).dims() == [%s].", + input_x_dims)); + } + auto outer_loops = 1; + for (auto i = 0; i < dim; i++) { + outer_loops *= input_x_dims[i]; + } + auto slice_size = 1; + for (auto i = dim + 1; i < input_x_dims.size(); i++) { + slice_size *= input_x_dims[i]; + } + + std::vector input_x_vec, input_y_vec, input_dout_vec; + paddle::framework::TensorToVector(input_x, dev_ctx, &input_x_vec); + paddle::framework::TensorToVector(input_y, dev_ctx, &input_y_vec); + paddle::framework::TensorToVector(input_out_grad, dev_ctx, &input_dout_vec); + std::vector out_dx_vec(output_x_grad->numel()); + std::vector out_dy_vec(output_y_grad->numel()); + + dev_ctx.template Alloc(output_x_grad); + dev_ctx.template Alloc(output_y_grad); + + for (auto i = 0; i < outer_loops; i++) { + for (auto j = 0; j < 3; j++) { + auto dst_pos = (3 * i + j) * slice_size; + auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; + auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; + for (auto k = 0; k < slice_size; k++) { + out_dx_vec[dst_pos + k] = + input_dout_vec[in_pos2 + k] * input_y_vec[in_pos1 + k] - + input_dout_vec[in_pos1 + k] * input_y_vec[in_pos2 + k]; + out_dy_vec[dst_pos + k] = + input_dout_vec[in_pos1 + k] * input_x_vec[in_pos2 + k] - + input_dout_vec[in_pos2 + k] * input_x_vec[in_pos1 + k]; + } + } + } + paddle::framework::TensorFromVector(out_dx_vec, dev_ctx, output_x_grad); + paddle::framework::TensorFromVector(out_dy_vec, dev_ctx, output_y_grad); + output_x_grad->Resize(input_x_dims); + output_y_grad->Resize(input_x_dims); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/cross_kernel_impl.h b/paddle/phi/kernels/impl/cross_kernel_impl.h new file mode 100644 index 0000000000000..6427d7f87193f --- /dev/null +++ b/paddle/phi/kernels/impl/cross_kernel_impl.h @@ -0,0 +1,116 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +template +void CrossKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + auto& input_x = x; + auto& input_y = y; + auto* output = out; + int dim = axis; + + auto input_x_dims = input_x.dims(); + auto input_y_dims = input_y.dims(); + bool dims_match = phi::funcs::CheckDims(input_x_dims, input_y_dims); + PADDLE_ENFORCE_EQ( + dims_match, + true, + phi::errors::InvalidArgument("The 'shape' of Input(X) should be equal to " + "the 'shape' of Input(Y). 
But received " + "Input(X).dimensions = [%s], " + "Input(Y).dimensions = [%s]", + input_x_dims, + input_x_dims)); + + if (dim != DDim::kMaxRank) { + PADDLE_ENFORCE_EQ( + dim < input_x_dims.size() && dim >= (0 - input_x_dims.size()), + true, + phi::errors::OutOfRange( + "Attr(dim) is out of range, It's expected " + "to be in range of [-%d, %d]. But received Attr(dim) = %d.", + input_x_dims.size(), + input_x_dims.size() - 1, + dim)); + if (dim < 0) { + dim += input_x_dims.size(); + } + + PADDLE_ENFORCE_EQ( + input_x_dims[dim] == 3, + true, + phi::errors::InvalidArgument( + "Input(X/Y).dims[dim] must be equal to 3. But received: " + "Input(X/Y).dims[dim] = [%d].", + input_x_dims[dim])); + } else { + for (auto i = 0; i < input_x_dims.size(); i++) { + if (input_x_dims[i] == 3) { + dim = i; + break; + } + } + PADDLE_ENFORCE_EQ(dim == DDim::kMaxRank, + false, + phi::errors::InvalidArgument( + "There must be at least one dimension 'd' so that " + "Input(X/Y).dims()[d] is equal to 3. " + "But received: Input(X/Y).dims() == [%s].", + input_x_dims)); + } + auto outer_loops = 1; + for (auto i = 0; i < dim; i++) { + outer_loops *= input_x_dims[i]; + } + auto slice_size = 1; + for (auto i = dim + 1; i < input_x_dims.size(); i++) { + slice_size *= input_x_dims[i]; + } + + std::vector input_x_vec, input_y_vec; + paddle::framework::TensorToVector(input_x, dev_ctx, &input_x_vec); + paddle::framework::TensorToVector(input_y, dev_ctx, &input_y_vec); + std::vector out_vec(output->numel()); + + dev_ctx.template Alloc(output); + + for (auto i = 0; i < outer_loops; i++) { + for (auto j = 0; j < 3; j++) { + auto dst_pos = (3 * i + j) * slice_size; + auto in_pos1 = (3 * i + ((j + 1) % 3)) * slice_size; + auto in_pos2 = (3 * i + ((j + 2) % 3)) * slice_size; + + for (auto k = 0; k < slice_size; k++) { + out_vec[dst_pos + k] = + input_x_vec[in_pos1 + k] * input_y_vec[in_pos2 + k] - + input_x_vec[in_pos2 + k] * input_y_vec[in_pos1 + k]; + } + } + } + paddle::framework::TensorFromVector(out_vec, dev_ctx, output); + output->Resize(input_x_dims); +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/cross_sig.cc b/paddle/phi/ops/compat/cross_sig.cc new file mode 100644 index 0000000000000..307c2ac5164b5 --- /dev/null +++ b/paddle/phi/ops/compat/cross_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature CrossOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("cross", {"X", "Y"}, {"dim"}, {"Out"}); +} + +KernelSignature CrossGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("cross_grad", + {"X", "Y", GradVarName("Out")}, + {"dim"}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(cross, phi::CrossOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cross_grad, phi::CrossGradOpArgumentMapping); From 539fb0d7f551b06de5af84f2b40c3ca65bd1a7f5 Mon Sep 17 00:00:00 2001 From: crystal <62974595+Zjq9409@users.noreply.github.com> Date: Thu, 24 Feb 2022 16:17:18 +0800 Subject: [PATCH 31/85] Fix unittests for eigh op (#39568) * fix eigh test * modify atol and rtol --- .../fluid/tests/unittests/test_eigh_op.py | 41 ++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_eigh_op.py b/python/paddle/fluid/tests/unittests/test_eigh_op.py index 8e8c9df199f14..3e8230e5d0c62 100644 --- a/python/paddle/fluid/tests/unittests/test_eigh_op.py +++ b/python/paddle/fluid/tests/unittests/test_eigh_op.py @@ -59,8 +59,12 @@ def setUp(self): self.dtype = "float32" np.random.seed(123) self.x_np = np.random.random(self.x_shape).astype(self.dtype) - self.rtol = 1e-5 - self.atol = 1e-5 + if (paddle.version.cuda() >= "11.6"): + self.rtol = 5e-6 + self.atol = 6e-5 + else: + self.rtol = 1e-5 + self.atol = 1e-5 def test_check_output_gpu(self): if paddle.is_compiled_with_cuda(): @@ -79,23 +83,30 @@ def test_check_output_gpu(self): class TestEighAPI(unittest.TestCase): def setUp(self): - self.init_input_shape() - self.dtype = "float32" + self.init_input_data() self.UPLO = 'L' - self.rtol = 1e-6 - self.atol = 1e-6 + if (paddle.version.cuda() >= "11.6"): + self.rtol = 5e-6 + self.atol = 6e-5 + else: + self.rtol = 1e-5 + self.atol = 1e-5 self.place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda() \ else paddle.CPUPlace() np.random.seed(123) + + def init_input_data(self): + self.x_shape = [5, 5] + self.dtype = "float32" self.real_data = np.random.random(self.x_shape).astype(self.dtype) - self.complex_data = np.random.random(self.x_shape).astype( + complex_data = np.random.random(self.x_shape).astype( self.dtype) + 1J * np.random.random(self.x_shape).astype(self.dtype) self.trans_dims = list(range(len(self.x_shape) - 2)) + [ len(self.x_shape) - 1, len(self.x_shape) - 2 ] - - def init_input_shape(self): - self.x_shape = [5, 5] + #build a random conjugate matrix + self.complex_symm = np.divide( + complex_data + np.conj(complex_data.transpose(self.trans_dims)), 2) def compare_result(self, actual_w, actual_v, expected_w, expected_v): np.testing.assert_allclose( @@ -129,9 +140,9 @@ def check_static_complex_result(self): exe = paddle.static.Executor(self.place) expected_w, expected_v = exe.run( main_prog, - feed={"input_x": self.complex_data}, + feed={"input_x": self.complex_symm}, fetch_list=[output_w, output_v]) - actual_w, actual_v = np.linalg.eigh(self.complex_data) + actual_w, actual_v = np.linalg.eigh(self.complex_symm) self.compare_result(actual_w, actual_v, expected_w, expected_v) def test_in_static_mode(self): @@ -146,14 +157,14 @@ def test_in_dynamic_mode(self): actual_w, actual_v = paddle.linalg.eigh(input_real_data) self.compare_result(actual_w, actual_v.numpy(), expected_w, expected_v) - input_complex_data = paddle.to_tensor(self.complex_data) - expected_w, 
expected_v = np.linalg.eigh(self.complex_data) + input_complex_data = paddle.to_tensor(self.complex_symm) + expected_w, expected_v = np.linalg.eigh(self.complex_symm) actual_w, actual_v = paddle.linalg.eigh(input_complex_data) self.compare_result(actual_w, actual_v.numpy(), expected_w, expected_v) def test_eigh_grad(self): paddle.disable_static() - x = paddle.to_tensor(self.complex_data, stop_gradient=False) + x = paddle.to_tensor(self.complex_symm, stop_gradient=False) w, v = paddle.linalg.eigh(x) (w.sum() + paddle.abs(v).sum()).backward() np.testing.assert_allclose( From df0b4434e90e72ff7ac4cef3b7365deb71925865 Mon Sep 17 00:00:00 2001 From: Lijunhui <1578034415@qq.com> Date: Thu, 24 Feb 2022 16:52:13 +0800 Subject: [PATCH 32/85] Optimize nearest_interp backward (#39067) * nearest_interp_bw init * optimize kernel config * optimize kernel config * fix struct init * optimize code * rm duplicated struct --- paddle/fluid/operators/interpolate_v2_op.cu | 102 ++++++++++++++------ 1 file changed, 73 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6e9d6a1995474..d61eb46d97e98 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -210,32 +210,66 @@ __global__ void KeNearestNeighbor3DInterpFw( } } +template +__global__ void KeNearestNeighborInterpNCHWBw( + T* in, const size_t in_img_h, const size_t in_img_w, const T* out, + const size_t out_img_h, const size_t out_img_w, const size_t nc, + const float ratio_h, const float ratio_w, const bool align_corners) { + int out_img_idx = threadIdx.x + blockIdx.x * blockDim.x; + int out_img_idy = threadIdx.y + blockIdx.y * blockDim.y; + int nc_id = threadIdx.z + blockIdx.z * blockDim.z; + int nc_stride = blockDim.z * gridDim.z; + + // nearest_sampling by multiple read in_addr and write to out_addr + int in_img_idx = (align_corners) + ? static_cast(ratio_w * out_img_idx + 0.5) + : static_cast(ratio_w * out_img_idx); + int in_img_idy = (align_corners) + ? 
static_cast(ratio_h * out_img_idy + 0.5) + : static_cast(ratio_h * out_img_idy); + + int in_index = (nc_id * in_img_h + in_img_idy) * in_img_w + in_img_idx; + int in_index_stride = nc_stride * in_img_h * in_img_w; + + int out_index = (nc_id * out_img_h + out_img_idy) * out_img_w + out_img_idx; + int out_index_stride = nc_stride * out_img_h * out_img_w; + + // prevent from multiple threads writing + if (out_img_idx < out_img_w && out_img_idy < out_img_h) { + while (nc_id < nc) { + T* in_pos = &in[in_index]; + const T out_pos = out[out_index]; + platform::CudaAtomicAdd(in_pos, out_pos); + in_index += in_index_stride; + out_index += out_index_stride; + nc_id += nc_stride; + } + } +} + template __global__ void KeNearestNeighborInterpBw( T* in, const size_t in_img_h, const size_t in_img_w, const size_t input_h, const size_t input_w, const T* out, const size_t out_img_h, const size_t out_img_w, const size_t output_h, const size_t output_w, const size_t num_channels, const float ratio_h, const float ratio_w, - const bool align_corners, const DataLayout data_layout) { + const bool align_corners, FastDivModForInterpolate divmods) { int nthreads = output_h * output_w; int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + int in_img_size = in_img_h * in_img_w; + int out_img_size = out_img_h * out_img_w; + for (; tid < nthreads; tid += stride) { - int out_id_h = tid / output_w; - int out_id_w = tid % output_w; - int in_img_size = input_w / num_channels; - int out_img_size = output_w / num_channels; + auto out_id_divmod = divmods.output_w_div.Divmod(tid); + int out_id_h = out_id_divmod.val[0]; + int out_id_w = out_id_divmod.val[1]; - int channel_id, out_img_idy, out_img_idx; - if (data_layout == DataLayout::kNCHW) { - channel_id = out_id_w / out_img_size; - out_img_idy = (out_id_w % out_img_size) / out_img_w; - out_img_idx = tid % out_img_w; - } else { - out_img_idy = out_id_w / (out_img_w * num_channels); - out_img_idx = out_id_w % (out_img_w * num_channels) / num_channels; - channel_id = tid % num_channels; - } + int channel_id = divmods.channels_div.Divmod(tid).val[1]; + auto outimg_id_divmod = divmods.output_wc_div.Divmod(out_id_w); + int out_img_idy = outimg_id_divmod.val[0]; + int out_img_idx = + divmods.channels_div.Divmod(outimg_id_divmod.val[1]).val[0]; int in_img_idy = (align_corners) ? static_cast(ratio_h * out_img_idy + 0.5) @@ -244,15 +278,10 @@ __global__ void KeNearestNeighborInterpBw( ? 
static_cast(ratio_w * out_img_idx + 0.5) : static_cast(ratio_w * out_img_idx); - T* in_pos; - if (data_layout == DataLayout::kNCHW) { - in_pos = &in[out_id_h * input_w + channel_id * in_img_size + - in_img_idy * in_img_w + in_img_idx]; - } else { - in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + - in_img_idx * num_channels + channel_id]; - } - const T out_pos = out[out_id_h * output_w + out_id_w]; + T* in_pos = &in[out_id_h * input_w + in_img_idy * in_img_w * num_channels + + in_img_idx * num_channels + channel_id]; + + const T out_pos = out[tid]; platform::CudaAtomicAdd(in_pos, out_pos); } } @@ -1842,11 +1871,26 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); if ("nearest" == interp_method) { - KeNearestNeighborInterpBw< - T><<>>( - input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, - n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); + if (data_layout == DataLayout::kNCHW) { + // get launch 3D config + int nc = n * c; + platform::GpuLaunchConfig config_3d = + GetGpuLaunchConfig3D(ctx.cuda_device_context(), nc, out_h, out_w); + KeNearestNeighborInterpNCHWBw< + T><<>>( + input_grad_data, in_h, in_w, output_grad_data, out_h, out_w, nc, + ratio_h, ratio_w, align_corners); + } else { + int64_t cw = c * out_w; + auto interp_divmods = FastDivModForInterpolate(c, out_chw, cw); + KeNearestNeighborInterpBw< + T><<>>( + input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, + out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, + interp_divmods); + } } else if ("bilinear" == interp_method) { const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; bool is_nchw = (data_layout == DataLayout::kNCHW) ? 
true : false; From 4d042a83732b8c2d4ff9abfd3e103b6f0799831d Mon Sep 17 00:00:00 2001 From: TeFeng Chen Date: Thu, 24 Feb 2022 17:14:46 +0800 Subject: [PATCH 33/85] build a Paddle Graph from CINN compiled program for execution with PE (#39724) * build a Paddle Graph from CINN compiled program for execution with PE * update names of some variables * fix random fail in build_cinn_pass_test and update some comments * fix compiler error by merging phi pr --- .../framework/paddle2cinn/build_cinn_pass.cc | 9 +- .../framework/paddle2cinn/build_cinn_pass.h | 7 + .../paddle2cinn/build_cinn_pass_test.cc | 4 +- .../framework/paddle2cinn/cinn_compiler.cc | 6 +- paddle/fluid/operators/cinn/CMakeLists.txt | 4 +- .../operators/cinn/cinn_launch_context.cc | 249 +++++++++++++----- .../operators/cinn/cinn_launch_context.h | 94 ++++--- .../cinn/cinn_launch_context_test.cc | 241 ++++++++++++----- paddle/fluid/operators/cinn/cinn_launch_op.h | 74 ++---- paddle/fluid/operators/cinn/test_helper.h | 12 + 10 files changed, 477 insertions(+), 223 deletions(-) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc index d55950064a4a2..6e55727c8bf67 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -44,11 +44,6 @@ DECLARE_string(deny_cinn_ops); namespace paddle { namespace framework { - -namespace ir { -class MemOptVarInfo; -} // namespace ir - namespace paddle2cinn { using framework::ir::Graph; @@ -398,9 +393,7 @@ std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, kNoNeedBufferFeeds, no_need_buffer_feeds.release()); // initialize empty map for kMemOptVarInfoFromMainGraph attribute, // it will be filled on the share_mem_opt_info_to_subgraph pass - subgraph->GetOrInit>>( - kMemOptVarInfoFromMainGraph); + subgraph->GetOrInit(kMemOptVarInfoFromMainGraph); return subgraph; } diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h index 8cb920831cc54..a902eacde820f 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -18,6 +18,10 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +namespace ir { +class MemOptVarInfo; +} // namespace ir + namespace paddle2cinn { constexpr char kCinnLaunchOp[] = "cinn_launch"; @@ -27,6 +31,9 @@ constexpr char kInternalVars[] = "InternalVars"; constexpr char kOutputVars[] = "OutputVars"; constexpr char kMemOptVarInfoFromMainGraph[] = "mem_opt_var_info_from_main_graph"; +using Name2VarInfoMap = + std::unordered_map>; // A pass named BuildCinnPass, the function of this pass is: // diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc index 919fc60d4cb61..bf9d1baaf394f 100644 --- a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -255,7 +255,9 @@ TEST(BuildCinnPassTest, AllOpSupportCinn) { ASSERT_EQ( std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), std::unordered_set({v0, v1, v2, v4})); - ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); + ASSERT_EQ(std::unordered_set(cinn_op->outputs.begin(), + cinn_op->outputs.end()), + std::unordered_set({v6, v7})); ASSERT_EQ(v1->outputs, std::vector({cinn_op})); ASSERT_EQ(v6->inputs, std::vector({cinn_op})); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc index 716cd85e7117a..706815185a1b5 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -248,10 +248,10 @@ std::unique_ptr CinnCompiler::CompileGraph( *compiled_obj = {std::move(graph_compiler), std::move(compiled_res.runtime_program), scope, symbol.var_model_to_program_map()}; - compiled_obj->launch_context = - std::make_unique( - compiled_obj->paddle2cinn_varmap, compiled_obj->scope); compiled_obj->cached_index = compiled_num; + compiled_obj->launch_context = + std::make_unique(graph, + *compiled_obj); return compiled_obj; } diff --git a/paddle/fluid/operators/cinn/CMakeLists.txt b/paddle/fluid/operators/cinn/CMakeLists.txt index a2fc080faadcf..f1247ebdf23c8 100644 --- a/paddle/fluid/operators/cinn/CMakeLists.txt +++ b/paddle/fluid/operators/cinn/CMakeLists.txt @@ -1,13 +1,13 @@ include(operators) cc_library(cinn_op_helper SRCS cinn_op_helper.cc DEPS operator device_context) -cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope cinn) +cc_library(cinn_launch_context SRCS cinn_launch_context.cc DEPS ddim lod_tensor scope proto_desc graph build_strategy parallel_executor cinn) SET(CINN_OP_DEPS string_helper cinn cinn_compiler cinn_op_helper cinn_launch_context) register_operators(DEPS ${CINN_OP_DEPS}) if (WITH_TESTING) - cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope cinn_launch_context) + cc_test(cinn_launch_context_test SRCS cinn_launch_context_test.cc DEPS ddim lod_tensor scope proto_desc graph cinn_launch_context cinn_instruction_run_op cinn) set_tests_properties(cinn_launch_context_test PROPERTIES LABELS "RUN_TYPE=CINN") SET(CINN_RUN_ENVIRONMENT "OMP_NUM_THREADS=1;runtime_include_dir=${PADDLE_BINARY_DIR}/third_party/CINN/src/external_cinn/cinn/runtime/cuda") diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 0b677f79f7f5d..0a21d937aa1a7 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -17,22 +17,39 @@ #include #include #include +#include 
"cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/details/execution_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" +#include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" #include "paddle/phi/core/ddim.h" namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::Scope; +using framework::LoDTensor; +using framework::ParallelExecutor; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; +using framework::paddle2cinn::Name2VarInfoMap; +using framework::paddle2cinn::kMemOptVarInfoFromMainGraph; -CinnLaunchContext::CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope) - : cinn_scope_(cinn_scope) { - // generate all names of the cinn execution arguments +CinnLaunchContext::CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj) + : cinn_scope_(compiled_obj.scope) { + // collect all names of the CINN execution arguments auto var_names = cinn_scope_->var_names(); cinn_argument_names_.reserve(var_names.size()); std::transform( @@ -40,7 +57,42 @@ CinnLaunchContext::CinnLaunchContext( std::inserter(cinn_argument_names_, cinn_argument_names_.end()), [](const auto& name_view) { return std::string(name_view.data()); }); // build name map between the original variables and compiled ones - BuildVarNameMap(paddle2cinn_varmap, cinn_argument_names_); + BuildVarNameMap(compiled_obj.paddle2cinn_varmap, cinn_argument_names_); + + const auto& input_var_names = + graph.Get>(framework::paddle2cinn::kInputVars); + const auto& output_var_names = + graph.Get>(framework::paddle2cinn::kOutputVars); + internal_var_names_ = + ExtractInternalVarNames(input_var_names, output_var_names); + // check completeness of output variables in compiled result + for (auto&& var_name : output_var_names) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::PreconditionNotMet( + "Variable(%s) not applied in CINN", var_name)); + } + + // initialize all execution arguments + InitializeArguments(); + // DEPRECATED(CtfGo): following callback assignment will be deprecated soon + for (auto&& var_name : input_var_names) { + if (IsVariableUsed(var_name)) { + AssignExternalVariable(var_name); + } + } + for (auto&& var_name : output_var_names) { + AssignExternalVariable(var_name); + } + for (auto&& var_name : internal_var_names_) { + AssignInternalVariable(var_name); + } + + // Convert the CINN runtime program to a Paddle graph + runtime_graph_ = std::make_unique( + BuildCompiledProgram(graph, compiled_obj)); + runtime_graph_->SetNotOwned( + kMemOptVarInfoFromMainGraph, + &graph.Get(kMemOptVarInfoFromMainGraph)); } void CinnLaunchContext::BuildVarNameMap( @@ -94,21 +146,15 @@ void CinnLaunchContext::UpdateCapturedEnv(const framework::Scope& scope, << std::addressof(place); } -bool 
CinnLaunchContext::IsArgumentsInitialized() const { - if (hold_buffers_.empty() || name2argument_.empty()) { - return false; - } - return true; -} - bool CinnLaunchContext::IsVariableUsed(const std::string& var_name) const { return paddle2cinn_varmap_.count(var_name) > 0; } -CinnTensor CinnLaunchContext::GetCinnTensor(const std::string& arg_name) { - PADDLE_ENFORCE_GT(cinn_argument_names_.count(arg_name), 0, - platform::errors::InvalidArgument( - "Variable(%s) not found in cinn scope.", arg_name)); +CinnTensor CinnLaunchContext::GetCinnTensorOfVar(const std::string& var_name) { + PADDLE_ENFORCE_EQ( + IsVariableUsed(var_name), true, + platform::errors::NotFound("Variable(%s) not applied in CINN", var_name)); + const auto& arg_name = paddle2cinn_varmap_.at(var_name); return cinn_scope_->GetTensor(arg_name); } @@ -132,10 +178,13 @@ std::unordered_set CinnLaunchContext::ExtractInternalVarNames( return remain_var_names; } -void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, - const LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor) { +void CinnLaunchContext::CheckTensorEquivalent( + const std::string& var_name, const framework::LoDTensor& paddle_tensor) { + PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, + platform::errors::InvalidArgument( + "Variable(%s) not applied in cinn", var_name)); // check dimension + auto cinn_tensor = GetCinnTensorOfVar(var_name); auto cinn_dims = phi::make_ddim(cinn_tensor->shape().data()); PADDLE_ENFORCE_EQ(paddle_tensor.dims(), cinn_dims, platform::errors::PreconditionNotMet( @@ -146,22 +195,28 @@ void CinnLaunchContext::CheckTensorEquivalent(const std::string& var_name, // TODO(CtfGo): check the underlying data type after CINN ready } +void CinnLaunchContext::InitializeArguments() { + for (auto&& arg : cinn_argument_names_) { + auto cinn_buffer = std::make_unique(); + auto cinn_tensor = GetCinnTensorOfVar(cinn2paddle_varmap_.at(arg)); + // assign dimensions with corresponding compiled tensor + cinn_buffer->resize(cinn_tensor->shape().data().data(), + cinn_tensor->shape().data().size()); + VLOG(4) << string::Sprintf( + "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg, + framework::DDim(cinn_buffer->dims, cinn_buffer->dimensions).to_str(), + name2argument_.size()); + name2argument_.emplace(arg, cinn_buffer.get()); + hold_buffers_.emplace_back(std::move(cinn_buffer)); + } +} + void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - const auto& paddle_tensor = cached_scope_->GetVar(var_name)->Get(); - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - if (paddle_tensor.IsInitialized()) { - CheckTensorEquivalent(var_name, paddle_tensor, cinn_tensor); - } - - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = cached_scope_->GetVar(var_name)->GetMutable(); @@ -177,22 +232,14 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { // Do nothing return 0; }); - - return AppendArgument(cinn_arg_name, 
std::move(cinn_buffer)); } void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { PADDLE_ENFORCE_EQ(IsVariableUsed(var_name), true, platform::errors::InvalidArgument( "Variable(%s) not applied in cinn", var_name)); - const auto& cinn_arg_name = paddle2cinn_varmap_.at(var_name); - - CinnTensor cinn_tensor = GetCinnTensor(cinn_arg_name); - auto cinn_buffer = std::make_unique(); - // assign dimensions and alloc/free callback of cinn_buffer_t - cinn_buffer->resize(cinn_tensor->shape().data().data(), - cinn_tensor->shape().data().size()); - + auto* cinn_buffer = GetCinnBufferOfVar(var_name); + // assign external malloc/free callbacks of cinn_buffer_t cinn_buffer->external_malloc = new std::function( [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = @@ -212,30 +259,106 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { tensor->clear(); return 0; }); - return AppendArgument(cinn_arg_name, std::move(cinn_buffer)); } -void CinnLaunchContext::AppendArgument( - const std::string& arg_name, std::unique_ptr&& buffer) { - name2argument_.emplace(arg_name, buffer.get()); - hold_buffers_.emplace_back(std::move(buffer)); - VLOG(4) << string::Sprintf( - "Append an argument:name(%s),dims(%s),argument size:(%lu)", arg_name, - framework::DDim(buffer->dims, buffer->dimensions).to_str(), - name2argument_.size()); +framework::ProgramDesc CinnLaunchContext::BuildCompiledProgram( + const framework::ir::Graph& graph, const CinnCompiledObject& compiled_obj) { + CinnRuntimeProgram* runtime_program = compiled_obj.runtime_program.get(); + // Step 0: Create an empty program_desc, there will be only one block + framework::ProgramDesc program_desc; + auto* block = program_desc.MutableBlock(0); + const std::vector>& instructions = + runtime_program->GetRunInstructions(); + + // build a map that links the name of a Paddle variable to its VarDesc + const std::unordered_set& nodes = graph.Nodes(); + std::unordered_map original_vardescs; + for (auto* node : nodes) { + if (node->IsVar() && node->Var()) { + original_vardescs.emplace(node->Name(), node->Var()); + } + } + + // Step 1: Create a VarDesc for each execution argument: + // (1) For those variables that are input or output variables of the + // original subgraph, there must exist an original VarDesc, so + // we copy some useful info(such as IsParameter,Persistable) + // to the new VarDesc. + // (2) For all variables, the shape, data type of their VarDescs + // are set by values of the corresponding compiled tensors, + // including the in/out variables where the equiality between their tensors + // and the CINN compiled ones is verified in corresponding cinn_launch_op. 
+ for (auto&& arg : cinn_argument_names_) { + const std::string& var_name = cinn2paddle_varmap_.at(arg); + framework::VarDesc* var_desc = block->Var(var_name); + var_desc->SetType(framework::proto::VarType::LOD_TENSOR); + + auto res = original_vardescs.find(var_name); + if (res != original_vardescs.end()) { + auto* ori_desc = res->second; + var_desc->SetPersistable(ori_desc->Persistable()); + var_desc->SetIsParameter(ori_desc->IsParameter()); + } + + auto cinn_tensor = GetCinnTensorOfVar(var_name); + // TODO(CtfGo): set the corresponding data type after CINN ready, + // currently set as FP32 in default + var_desc->SetDataType(framework::proto::VarType::FP32); + var_desc->SetShape(std::vector(cinn_tensor->shape().data().begin(), + cinn_tensor->shape().data().end())); + } + + // transform names of the input or output arguments of a CINN instruction + // to the corresponding Paddle variable names, and repack them as one vector + auto trans_and_pack_args_fn = + [this](const std::vector>& cinn_args_array) { + std::vector var_names; + for (auto&& cinn_args : cinn_args_array) { + for (auto&& arg : cinn_args) { + auto res = cinn2paddle_varmap_.find(arg); + PADDLE_ENFORCE_NE( + res, cinn2paddle_varmap_.end(), + platform::errors::NotFound("Argument(%s) not found", arg)); + var_names.emplace_back(res->second); + } + } + return var_names; + }; + + // Step 2: create a VarDesc of cinn_instruction_run op for + // each CINN instruction and append it to the main block + for (auto ins_idx = 0; ins_idx < instructions.size(); ++ins_idx) { + auto* ins = instructions.at(ins_idx).get(); + auto in_args = trans_and_pack_args_fn(ins->GetInArgs()); + auto out_args = trans_and_pack_args_fn(ins->GetOutArgs()); + + auto* op_desc = block->AppendOp(); + op_desc->SetType("cinn_instruction_run"); + op_desc->SetInput(kX, in_args); + op_desc->SetOutput(kOutputs, out_args); + op_desc->SetAttr(kCachedIndex, + {static_cast(compiled_obj.cached_index)}); + op_desc->SetAttr(kInstructionIndex, {static_cast(ins_idx)}); + } + + return program_desc; } -const std::map& -CinnLaunchContext::FinalizeArguments() const { - // Check all execution parameters are assigned valued. 
- std::for_each(cinn_argument_names_.begin(), cinn_argument_names_.end(), - [this](const auto& arg_name) { - PADDLE_ENFORCE_GT( - name2argument_.count(arg_name), 0, - platform::errors::NotFound( - "Argument(%s) is missed for execution", arg_name)); - }); - return name2argument_; +ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, + framework::Scope* scope) { + if (!parallel_executor_) { + framework::details::ExecutionStrategy exec_strategy; + framework::details::BuildStrategy build_strategy; + parallel_executor_ = std::make_unique( + place, scope, exec_strategy, build_strategy, runtime_graph_.get()); + } + + // update the scope bound to an OpHandle and rebuild temporary variables + std::unordered_map scope_map = { + {parallel_executor_->GetLocalScopes().front(), scope}}; + parallel_executor_->ResetOpHandleScopeMapOfGraphs(scope_map); + parallel_executor_->PrepareVariables(scope); + return parallel_executor_.get(); } cinn_buffer_t* CinnLaunchContext::GetCinnBufferOfVar( diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.h b/paddle/fluid/operators/cinn/cinn_launch_context.h index 502e6a92dc10b..a4d613ea618a8 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.h +++ b/paddle/fluid/operators/cinn/cinn_launch_context.h @@ -21,7 +21,7 @@ #include #include #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/platform/place.h" #include "paddle/phi/core/ddim.h" @@ -35,10 +35,25 @@ class Program; } // namespace cinn::hlir::framework namespace paddle { +namespace framework { +class ProgramDesc; +class Scope; +class VarDesc; + +namespace ir { +class Graph; +} // namespace ir + +namespace paddle2cinn { +class CinnCompiledObject; +} // namespace paddle2cinn +} // namespace framework + namespace operators::details { using CinnTensor = ::cinn::hlir::framework::Tensor; using CinnScope = ::cinn::hlir::framework::Scope; +using CinnCompiledObject = framework::paddle2cinn::CinnCompiledObject; // This class is used to cache some reusable data among repeated // executions for efficiency and it also provides easy interfaces @@ -49,58 +64,71 @@ using CinnScope = ::cinn::hlir::framework::Scope; // Variable while a CINN variable is called an Argument. class CinnLaunchContext { public: - explicit CinnLaunchContext( - const std::unordered_map& paddle2cinn_varmap, - const std::shared_ptr& cinn_scope); + explicit CinnLaunchContext(const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); + + // Initialize a ParallelExecutor to execute the runtime graph, + // it will be constructed in the first call, and just update + // the execution scope in the following usage. 
+ framework::ParallelExecutor* InitializePE(const platform::Place& place, + framework::Scope* scope); // explicitly update several environment variables captured // by callback of execution arguments void UpdateCapturedEnv(const framework::Scope& scope, const platform::Place& place); - // Return whether execution arguments has been initialized - bool IsArgumentsInitialized() const; - // Return whether a Paddle variable used in cinn execution bool IsVariableUsed(const std::string& var_name) const; - // Assign tensor buffer to input or output variables - void AssignExternalVariable(const std::string& var_name); - - // Assign tensor buffer to internal variables - void AssignInternalVariable(const std::string& var_name); + // Check the equiality in type and dimension between the tensor + // in Paddle and the compiled tensor returned by CINN of a same variable + void CheckTensorEquivalent(const std::string& var_name, + const framework::LoDTensor& paddle_tensor); - // Extract internal variable names from all applied variables - // in execution by excluding the input and output variables - std::unordered_set ExtractInternalVarNames( - const std::vector& input_var_names, - const std::vector& output_var_names); + // Return internal variable names list + const std::unordered_set& GetInternalVarNames() const { + return internal_var_names_; + } // Finalize all execution arguments and return the name->argument map - const std::map& FinalizeArguments() const; + const std::map& FinalizeArguments() const { + return name2argument_; + } // Return the cinn_buffer_t* of a specific variable cinn_buffer_t* GetCinnBufferOfVar(const std::string& var_name); private: - // Get CinnTensor with CINN argument name - CinnTensor GetCinnTensor(const std::string& arg_name); + // Get corresponding compiled tensor of a Paddle variable name + CinnTensor GetCinnTensorOfVar(const std::string& var_name); + // Build the name maps of paddle->cinn and cinn->paddle // in reverse for all variables used in cinn execution void BuildVarNameMap( const std::unordered_map& compiled_varmap, const std::unordered_set& argument_names); - // Check whether the tensor in Paddle and the compiled - // tensor returned by CINN of a same variable - // are equivalent in type and dimension - void CheckTensorEquivalent(const std::string& var_name, - const framework::LoDTensor& paddle_tensor, - const CinnTensor& cinn_tensor); + // Extract internal variable names from all applied variables + // in execution by excluding the input and output variables + std::unordered_set ExtractInternalVarNames( + const std::vector& input_var_names, + const std::vector& output_var_names); + + // Initialize each execution argument with a cinn_buffer_t + void InitializeArguments(); - // Append an argument with (cinn name)->(cinn_buffer_t) pair - void AppendArgument(const std::string& arg_name, - std::unique_ptr&& buffer); + // Assign tensor buffer to input or output variables + void AssignExternalVariable(const std::string& var_name); + + // Assign tensor buffer to internal variables + void AssignInternalVariable(const std::string& var_name); + + // Construct a Paddle ProgramDesc with the CINN runtime + // instructions included in the compiled CINN Program + framework::ProgramDesc BuildCompiledProgram( + const framework::ir::Graph& graph, + const CinnCompiledObject& compiled_obj); private: const framework::Scope* cached_scope_ = nullptr; @@ -111,16 +139,22 @@ class CinnLaunchContext { std::unordered_map paddle2cinn_varmap_; // a name map from cinn execution arguments to 
paddle variables std::unordered_map cinn2paddle_varmap_; + // a list of internal variable names in Paddle + std::unordered_set internal_var_names_; // the names of the cinn arguments used in compiled executable program std::unordered_set cinn_argument_names_; // the variable scope compiled from cinn const std::shared_ptr cinn_scope_; + // the ir::Graph object converted from the program compiled by CINN + std::unique_ptr runtime_graph_; + // a ParallelExecutor to execute the runtime graph + std::unique_ptr parallel_executor_; + // because a cinn_pod_value_t does not own a cinn_buffer_t object, // an extra stroage is necessary to keep those objects and they can // not be released until the runtime program finish execution. std::vector> hold_buffers_; - // this map saves all execution arguments with their cinn names as key, // and it is passed to the Execute interface of a cinn runtime program. std::map name2argument_; diff --git a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc index 58a9c5db712b9..4976a59d1dd38 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context_test.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context_test.cc @@ -13,87 +13,229 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/cinn/cinn_launch_context.h" +#include +#include +#include +#include "cinn/common/target.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/instruction.h" #include "cinn/hlir/framework/scope.h" #include "cinn/hlir/framework/tensor.h" #include "cinn/runtime/cinn_runtime.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/cinn/cinn_op_helper.h" #include "paddle/phi/core/ddim.h" +USE_OP(cinn_instruction_run); namespace paddle { namespace operators::details { -using LoDTensor = framework::LoDTensor; +using framework::OpDesc; +using framework::ProgramDesc; +using framework::LoDTensor; +using framework::ir::Graph; +using framework::ParallelExecutor; +using framework::paddle2cinn::Name2VarInfoMap; using CinnShape = ::cinn::hlir::framework::Shape; +using CinnInstruction = ::cinn::hlir::framework::Instruction; +using CinnRuntimeProgram = ::cinn::hlir::framework::Program; -std::unique_ptr CreateDefaultLaunchContext() { +const Graph& InitDefaultSubgraph() { static std::once_flag initialized; - static std::unordered_map paddle2cinn_varmap; - static std::shared_ptr cinn_scope; - std::call_once(initialized, [&paddle2cinn_varmap, &cinn_scope]() { - auto& scope = cinn_scope; - scope = std::make_shared(); + static std::unique_ptr graph; + std::call_once(initialized, [&]() { + ProgramDesc program; + auto* block = program.MutableBlock(0); + auto* var1 = block->Var("var1"); + var1->SetPersistable(true); + block->Var("var2"); + block->Var("var3"); + block->Var("var4"); + auto* var5 = block->Var("var5"); + var5->SetIsParameter(true); + auto add_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var1"}}, {"Y", {"var2"}}}, + {{"Out", {"var3"}}}, {})); + block->AppendAllocatedOp(std::move(add_op)); + auto mul_op = std::unique_ptr(new OpDesc( + "mul", {{"X", {"var1"}}, {"Y", {"var2"}}}, 
{{"Out", {"var4"}}}, {})); + block->AppendAllocatedOp(std::move(mul_op)); + auto res_op = std::unique_ptr( + new OpDesc("elementwise_add", {{"X", {"var3"}}, {"Y", {"var4"}}}, + {{"Out", {"var5"}}}, {})); + block->AppendAllocatedOp(std::move(res_op)); + graph = std::make_unique(program); + + graph->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({"var1", "var2"})); + graph->Set>( + framework::paddle2cinn::kInternalVars, + new std::vector({"var3", "var4"})); + graph->Set>( + framework::paddle2cinn::kOutputVars, + new std::vector({"var5"})); + graph->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); + }); + return *graph.get(); +} +CinnCompiledObject* InitDefaultCompiledObject() { + static std::once_flag initialized; + static auto compiled_obj = std::make_unique(); + std::call_once(initialized, [result = compiled_obj.get()]() { + auto& scope = result->scope; + scope = std::make_shared(); scope->Var("cinn_var1"); scope->GetTensor("cinn_var1")->Resize(CinnShape({3, 4})); scope->Var("cinn_var2"); scope->GetTensor("cinn_var2")->Resize(CinnShape({6, 7, 8})); scope->Var("cinn_var3"); scope->GetTensor("cinn_var3")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var4"); + scope->GetTensor("cinn_var4")->Resize(CinnShape({10, 16})); + scope->Var("cinn_var5"); + scope->GetTensor("cinn_var5")->Resize(CinnShape({10, 16})); - paddle2cinn_varmap = { - {"var1", "cinn_var1"}, {"var3", "cinn_var3"}, {"var4", "cinn_var4"}}; + // input variables: var1, var2; output: var5 + // internal variables: var3 and var4, here var3 is retained + // in result map, so the name will be used neither cinn_var3 + auto& paddle2cinn_varmap = result->paddle2cinn_varmap; + paddle2cinn_varmap = {{"var1", "cinn_var1"}, + {"var2", "cinn_var2"}, + {"var3", "cinn_var3"}, + {"var5", "cinn_var5"}}; + + auto& runtime_program = result->runtime_program; + std::vector> instructions; + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var3"}, "elementwise_add")); + instructions.emplace_back( + new CinnInstruction(cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var1", "cinn_var2"}, {"cinn_var4"}, "mul")); + instructions.emplace_back(new CinnInstruction( + cinn::common::DefaultHostTarget(), scope.get(), + {"cinn_var3", "cinn_var4"}, {"cinn_var5"}, "elementwise_add")); + runtime_program = + std::make_unique(scope, std::move(instructions)); + result->cached_index = 110; }); - return std::make_unique(paddle2cinn_varmap, cinn_scope); + return compiled_obj.get(); } -TEST(CinnLaunchContextTest, TestBasic) { - auto launch_context = CreateDefaultLaunchContext(); - // test IsVariableUsed +class CinnLaunchContextTest : public ::testing::Test { + public: + std::unique_ptr launch_context; + CinnCompiledObject* compiled_obj; + + void SetUp() override { + compiled_obj = InitDefaultCompiledObject(); + launch_context = std::make_unique(InitDefaultSubgraph(), + *compiled_obj); + } +}; + +TEST_F(CinnLaunchContextTest, TestConstructResult) { ASSERT_EQ(launch_context->IsVariableUsed("var1"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var2"), true); + ASSERT_EQ(launch_context->IsVariableUsed("var3"), true); ASSERT_EQ(launch_context->IsVariableUsed("var4"), false); - // test UpdateCapturedEnv - platform::CPUPlace place; - framework::Scope scope; - ASSERT_NO_THROW(launch_context->UpdateCapturedEnv(scope, place)); - // test IsArgumentsInitialized - ASSERT_FALSE(launch_context->IsArgumentsInitialized()); + 
ASSERT_EQ(launch_context->IsVariableUsed("var5"), true); + + // check result of ExtractInternalVarNames + ASSERT_EQ(launch_context->GetInternalVarNames(), + std::unordered_set({"var3", "cinn_var4"})); + + // check completeness of arguments list, and also check + // the two name maps of the paddle->cinn and the reverse one + // through the IsVariableUsed interface + auto&& arguments = launch_context->FinalizeArguments(); + ASSERT_EQ(arguments.size(), 5); + auto check_argument_fn = [&arguments, this](const std::string& var_name, + const std::string& arg_name) { + ASSERT_EQ(launch_context->IsVariableUsed(var_name), true); + ASSERT_NO_THROW(launch_context->GetCinnBufferOfVar(var_name)); + ASSERT_GT(arguments.count(arg_name), 0); + EXPECT_EQ(launch_context->GetCinnBufferOfVar(var_name), + static_cast(arguments.at(arg_name))); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + auto&& scope = compiled_obj->scope; + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(scope->GetTensor(arg_name)->shape().data())); + }; + check_argument_fn("var1", "cinn_var1"); + check_argument_fn("var2", "cinn_var2"); + check_argument_fn("var3", "cinn_var3"); + check_argument_fn("cinn_var4", "cinn_var4"); + check_argument_fn("var5", "cinn_var5"); } -TEST(CinnLaunchContextTest, TestCheckTensorEquivalent) { +TEST_F(CinnLaunchContextTest, TestCheckTensorEquivalent) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); auto* tensor1 = scope.Var("var1")->GetMutable(); // CheckTensorEquivalent: tensor dimension not equivalent tensor1->mutable_data(phi::make_ddim({3, 5}), place); - ASSERT_THROW(launch_context->AssignExternalVariable("var1"), + ASSERT_THROW(launch_context->CheckTensorEquivalent("var1", *tensor1), paddle::platform::EnforceNotMet); } -TEST(CinnLaunchContextTest, TestAssignVariablePreCondition) { +TEST_F(CinnLaunchContextTest, TestBuildCompiledProgram) { platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); - launch_context->UpdateCapturedEnv(scope, place); - auto* tensor4 = scope.Var("var4")->GetMutable(); + ParallelExecutor* pe = nullptr; + ASSERT_NO_THROW((pe = launch_context->InitializePE(place, &scope))); - // not used - ASSERT_THROW(launch_context->AssignExternalVariable("var4"), - paddle::platform::EnforceNotMet); - // not found - ASSERT_THROW(launch_context->AssignInternalVariable("cinn_var4"), - paddle::platform::EnforceNotMet); + // check details of program build by compiled instructions + const ProgramDesc& program = pe->Graph().OriginProgram(); + ASSERT_EQ(program.Size(), 1); + const auto& block = program.Block(0); + // vars + std::set var_names = block.LocalVarNames(); + ASSERT_EQ(var_names.size(), 5); + for (auto&& var_name : var_names) { + auto* var = block.FindVar(var_name); + ASSERT_NE(var, nullptr); + auto* buffer = launch_context->GetCinnBufferOfVar(var_name); + ASSERT_EQ(framework::DDim(buffer->dims, buffer->dimensions), + phi::make_ddim(var->GetShape())); + } + ASSERT_TRUE(block.FindVar("var1")->Persistable()); + ASSERT_FALSE(block.FindVar("var5")->Persistable()); + ASSERT_TRUE(block.FindVar("var5")->IsParameter()); + ASSERT_FALSE(block.FindVar("var1")->IsParameter()); + // ops + ASSERT_EQ(block.OpSize(), 3); + auto* op1 = block.Op(0); + ASSERT_EQ(op1->Type(), "cinn_instruction_run"); + ASSERT_EQ(op1->Input(kX), std::vector({"var1", "var2"})); + ASSERT_EQ(op1->Output(kOutputs), std::vector({"var3"})); 
+ ASSERT_EQ(op1->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op1->GetAttrIfExists(kInstructionIndex), 0); + auto* op3 = block.Op(2); + ASSERT_EQ(op3->Type(), "cinn_instruction_run"); + ASSERT_EQ(op3->Input(kX), std::vector({"var3", "cinn_var4"})); + ASSERT_EQ(op3->Output(kOutputs), std::vector({"var5"})); + ASSERT_EQ(op3->GetAttrIfExists(kCachedIndex), 110); + ASSERT_EQ(op3->GetAttrIfExists(kInstructionIndex), 2); } -TEST(CinnLaunchContextTest, TestAppendArgument) { - platform::CPUPlace cpu_place; - platform::Place place(cpu_place); +// DEPRECATED(CtfGo): following test of callback assignment +// will be deprecated after we switch to pe +TEST_F(CinnLaunchContextTest, TestCallbackAssignment) { + platform::CPUPlace place; framework::Scope scope; - auto launch_context = CreateDefaultLaunchContext(); launch_context->UpdateCapturedEnv(scope, place); // assign external variables @@ -101,33 +243,8 @@ TEST(CinnLaunchContextTest, TestAppendArgument) { float* data1 = tensor1->mutable_data(phi::make_ddim({3, 4}), place); data1[0] = 9.99f; data1[10] = 19.99f; - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var1")); - - auto* tensor3 = scope.Var("var3")->GetMutable(); - tensor3->mutable_data(phi::make_ddim({10, 16}), place); - ASSERT_NO_THROW(launch_context->AssignExternalVariable("var3")); - - // FinalizeArguments missed check - ASSERT_THROW(launch_context->FinalizeArguments(), - paddle::platform::EnforceNotMet); - // test get internal variables - auto internal_variable_names = - launch_context->ExtractInternalVarNames({"var1"}, {"var3"}); - ASSERT_EQ(internal_variable_names.size(), 1); - EXPECT_EQ(*internal_variable_names.begin(), "cinn_var2"); - - auto* tensor2 = scope.Var("var2")->GetMutable(); - tensor2->mutable_data(phi::make_ddim({6, 7, 8}), place); - ASSERT_NO_THROW(launch_context->AssignInternalVariable("cinn_var2")); - // check argument is set correctly and alloc/free callbacks work well - auto name2argument = launch_context->FinalizeArguments(); - ASSERT_EQ(name2argument.size(), 3); - ASSERT_EQ(name2argument.count("cinn_var1"), 1); - ASSERT_TRUE(launch_context->IsArgumentsInitialized()); - - auto* cinn_buffer = - static_cast(name2argument.at("cinn_var1")); + auto* cinn_buffer = launch_context->GetCinnBufferOfVar("var1"); ASSERT_EQ(cinn_buffer->memory, nullptr); cinn_buffer->external_malloc->operator()(nullptr, cinn_buffer); ASSERT_NE(cinn_buffer->memory, nullptr); diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 1db9f2f25e270..cf3b98c6679b8 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -105,63 +105,29 @@ class CinnLaunchOpKernel : public framework::OpKernel { auto* launch_context = cinn_compiled_object.launch_context.get(); // Step 3. Prepare arguments needed for the compiled executable program. launch_context->UpdateCapturedEnv(scope, place); - if (!launch_context->IsArgumentsInitialized()) { - VLOG(4) << "CinnLaunchOp prepare arguments"; - - // 3.1 Prepare input variables: tensors of input variables have - // been initialized before graph compiled, just check the - // equiality between tensors of paddle and cinn. 
- for (const auto& var_name : input_no_need_buffer_variable_names) { - // the input variable declared as 'no need buffer' can not be used - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), false, - platform::errors::InvalidArgument( - "Input variable(%s) should not be used by cinn in execution", - var_name)); - } - - for (const auto& var_name : input_x_variable_names) { - // some input variables don't need for cinn because they are - // eliminated by optimized passes or some cinn operators use - // less variables - if (!launch_context->IsVariableUsed(var_name)) { - VLOG(4) << "Input variable" << var_name << " not used by cinn"; - continue; - } - - launch_context->AssignExternalVariable(var_name); - } - - // 3.2 Prepare output variables: all output variables should - // be initialized and allocated buffer before - // the runtime program start execution, the compilation result - // includes details of their buffer assginment and we use that to - // allocate space in Paddle. For those variables allocated yet, - // like persistable parameters, just check the equiality between - // Paddle allocation and CINN buffer assginment. - auto output_variable_names = ctx.OutputNames(kOutputs); - for (const auto var_name : output_variable_names) { - PADDLE_ENFORCE_EQ( - launch_context->IsVariableUsed(var_name), true, - platform::errors::InvalidArgument( - "Output variable(%s) not used by cinn", var_name)); - - launch_context->AssignExternalVariable(var_name); - } - - // 3.3 Prepare internal or temporary variables: Create a temporary - // scope to keep internal variables within graph or temporary - // variables needed by the compiled runtime program in addition. - // Here we directly use the names from CinnScope as Paddle variable - // names, because they will not be used outside the graph - // and should be destructed after computation finished. - auto internal_variable_names = launch_context->ExtractInternalVarNames( - input_x_variable_names, output_variable_names); - for (const auto& var_name : internal_variable_names) { - launch_context->AssignInternalVariable(var_name); + // 3.1 Input variables: tensors of input variables have + // been initialized before graph compiled, just check the + // equiality between tensors of paddle and cinn. + for (const auto& var_name : input_x_variable_names) { + // some input variables don't need for cinn because they are + // eliminated by optimized passes or some cinn operators use + // less variables + if (!launch_context->IsVariableUsed(var_name)) { + VLOG(4) << "Input variable" << var_name << " not used by cinn"; + continue; } + launch_context->CheckTensorEquivalent(var_name, + *inputs_name2tensor.at(var_name)); } + // 3.2 Output variables: the output variables will be initialized + // and allocated buffer in callbacks which are defined in the + // external_malloc/free interface of cinn_buffer_t + // in their corresponding arguments. + // 3.3 Internal variables: A temporary scope is created in + // UpdateCapturedEnv to keep the internal variables and + // they are also initialized through callbacks + // Step 4. Set CINN runtime FLAGS, such as FLAGS_cinn_cudnn_deterministic. details::SetCinnRuntimeFlags(); diff --git a/paddle/fluid/operators/cinn/test_helper.h b/paddle/fluid/operators/cinn/test_helper.h index eb3d725d554b1..9720a5309fa6e 100644 --- a/paddle/fluid/operators/cinn/test_helper.h +++ b/paddle/fluid/operators/cinn/test_helper.h @@ -22,6 +22,7 @@ limitations under the License. 
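A note on the callback-driven buffers referred to above: each CINN argument is a cinn_buffer_t whose memory is only materialized when its external_malloc callback fires during execution and is released again through external_free, which is why output and internal variables no longer need eager allocation in this kernel. A rough standalone sketch of that pattern (a hypothetical Buffer type and callbacks, not cinn_buffer_t's actual layout):

#include <cstddef>
#include <cstdlib>
#include <functional>

struct Buffer {  // stand-in for cinn_buffer_t
  void* memory = nullptr;
  size_t size_in_bytes = 0;
  std::function<void(Buffer*)> external_malloc;  // fills `memory`
  std::function<void(Buffer*)> external_free;    // releases `memory`
};

int main() {
  Buffer buf;
  buf.size_in_bytes = 3 * 4 * sizeof(float);
  // The launch context wires the callbacks up front...
  buf.external_malloc = [](Buffer* b) { b->memory = std::malloc(b->size_in_bytes); };
  buf.external_free = [](Buffer* b) { std::free(b->memory); b->memory = nullptr; };
  // ...and the runtime program fires them lazily around the instruction
  // that actually touches this buffer.
  buf.external_malloc(&buf);
  /* run the instruction here */
  buf.external_free(&buf);
  return 0;
}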
*/ #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" #include "paddle/fluid/framework/scope.h" #include "paddle/phi/core/ddim.h" @@ -31,6 +32,7 @@ using LoDTensor = framework::LoDTensor; using Variable = framework::Variable; using Graph = framework::ir::Graph; using Node = framework::ir::Node; +using framework::paddle2cinn::Name2VarInfoMap; std::unique_ptr CreateOnlyElementwiseAddGraph( const std::string& x_name, const std::string& y_name, @@ -71,6 +73,16 @@ std::unique_ptr CreateOnlyElementwiseAddGraph( y_node->inputs = {feed_op_node_y}; y_node->outputs = {elementwise_add_node}; out_node->inputs = {elementwise_add_node}; + // set necessary attributes + g->Set>( + framework::paddle2cinn::kInputVars, + new std::vector({x_name, y_name})); + g->Set>(framework::paddle2cinn::kInternalVars, + new std::vector({})); + g->Set>(framework::paddle2cinn::kOutputVars, + new std::vector({out_name})); + g->GetOrInit( + framework::paddle2cinn::kMemOptVarInfoFromMainGraph); return g; } From 127440c3c540c2327c7aa570427ab1d59b8a3518 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 24 Feb 2022 17:38:22 +0800 Subject: [PATCH 34/85] [phi] move randint to phi (#39872) * move randint to phi * use host generator --- paddle/fluid/operators/randint_op.cc | 34 ---------- paddle/fluid/operators/randint_op.cu | 84 ------------------------ paddle/phi/kernels/cpu/randint_kernel.cc | 63 ++++++++++++++++++ paddle/phi/kernels/gpu/randint_kernel.cu | 80 ++++++++++++++++++++++ paddle/phi/kernels/randint_kernel.h | 39 +++++++++++ paddle/phi/ops/compat/randint_sig.cc | 63 ++++++++++++++++++ 6 files changed, 245 insertions(+), 118 deletions(-) delete mode 100644 paddle/fluid/operators/randint_op.cu create mode 100644 paddle/phi/kernels/cpu/randint_kernel.cc create mode 100644 paddle/phi/kernels/gpu/randint_kernel.cu create mode 100644 paddle/phi/kernels/randint_kernel.h create mode 100644 paddle/phi/ops/compat/randint_sig.cc diff --git a/paddle/fluid/operators/randint_op.cc b/paddle/fluid/operators/randint_op.cc index 09c58cd7d4cda..548e28716dd91 100644 --- a/paddle/fluid/operators/randint_op.cc +++ b/paddle/fluid/operators/randint_op.cc @@ -24,37 +24,6 @@ namespace paddle { namespace operators { -template -class CPURandintKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector new_shape; - auto list_new_shape_tensor = - ctx.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || ctx.HasInput("ShapeTensor")) { - if (ctx.HasInput("ShapeTensor")) { - auto* shape_tensor = ctx.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - auto* out = ctx.Output("Out"); - if (!new_shape.empty()) out->Resize(phi::make_ddim(new_shape)); - T* data = out->mutable_data(ctx.GetPlace()); - int64_t size = out->numel(); - - std::uniform_int_distribution dist(ctx.Attr("low"), - ctx.Attr("high") - 1); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - } -}; - class RandintOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -176,6 +145,3 @@ REGISTER_OPERATOR( randint, 
ops::RandintOp, ops::RandintOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker) - -REGISTER_OP_CPU_KERNEL(randint, ops::CPURandintKernel, - ops::CPURandintKernel) diff --git a/paddle/fluid/operators/randint_op.cu b/paddle/fluid/operators/randint_op.cu deleted file mode 100644 index 2f9a8cfd142ec..0000000000000 --- a/paddle/fluid/operators/randint_op.cu +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include -#include -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/uniform_random_op.h" - -namespace paddle { -namespace operators { - -template -class GPURandintKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - std::vector new_shape; - auto list_new_shape_tensor = - context.MultiInput("ShapeTensorList"); - if (list_new_shape_tensor.size() > 0 || context.HasInput("ShapeTensor")) { - if (context.HasInput("ShapeTensor")) { - auto* shape_tensor = context.Input("ShapeTensor"); - new_shape = GetNewDataFromShapeTensor(shape_tensor); - } else if (list_new_shape_tensor.size() > 0) { - new_shape = GetNewDataFromShapeTensorList(list_new_shape_tensor); - } - } - - platform::CPUPlace cpu; - auto dtype = static_cast( - context.Attr("dtype")); - auto* out = context.Output("Out"); - if (!new_shape.empty()) out->Resize(phi::make_ddim(new_shape)); - T low = static_cast(context.Attr("low")); - T high = static_cast(context.Attr("high")) - 1; - framework::LoDTensor tensor; - tensor.Resize(out->dims()); - tensor.mutable_data(cpu, framework::TransToPtenDataType(dtype)); - T* data = tensor.mutable_data(cpu); - - int64_t size = out->numel(); - unsigned int seed = static_cast(context.Attr("seed")); - - /* - std::minstd_rand engine; - if (seed == 0) { - std::random_device rd; - seed = rd(); - } - engine.seed(seed); - */ - - std::uniform_int_distribution<> dist(context.Attr("low"), - context.Attr("high") - 1); - auto engine = framework::GetCPURandomEngine(seed); - - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } - - if (platform::is_gpu_place(context.GetPlace())) { - // Copy tensor to out - framework::TensorCopy(tensor, context.GetPlace(), out); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(randint, ops::GPURandintKernel, - ops::GPURandintKernel) diff --git a/paddle/phi/kernels/cpu/randint_kernel.cc b/paddle/phi/kernels/cpu/randint_kernel.cc new file mode 100644 index 0000000000000..5fe56b57452d5 --- /dev/null +++ b/paddle/phi/kernels/cpu/randint_kernel.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/randint_kernel.h" + +#include + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RandintRawKernel(const Context& ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + int seed, + DenseTensor* out) { + out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + auto size = out->numel(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = ctx.GetGenerator()->GetCPUEngine(); + } + std::uniform_int_distribution dist(low, high - 1); + auto data = out->data(); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template +void RandintKernel(const Context& ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + DenseTensor* out) { + RandintRawKernel(ctx, low, high, shape, dtype, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + randint_raw, CPU, ALL_LAYOUT, phi::RandintRawKernel, int, int64_t) {} +PD_REGISTER_KERNEL(randint, CPU, ALL_LAYOUT, phi::RandintKernel, int, int64_t) { +} diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu new file mode 100644 index 0000000000000..b89b714c73d92 --- /dev/null +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/randint_kernel.h" + +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/memcpy.h" + +namespace phi { + +template +void RandintRawKernel(const Context& ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + int seed, + DenseTensor* out) { + DenseTensor tmp; + tmp.Resize(phi::make_ddim(shape.GetData())); + T* tmp_data = ctx.template HostAlloc(&tmp); + + out->ResizeAndAllocate(tmp.dims()); + auto size = out->numel(); + + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = ctx.GetHostGenerator()->GetCPUEngine(); + } + std::uniform_int_distribution dist(low, high - 1); + auto data = out->data(); + for (int64_t i = 0; i < size; ++i) { + tmp_data[i] = dist(*engine); + } + + paddle::memory::Copy( + out->place(), + data, + tmp.place(), + tmp_data, + size * paddle::experimental::SizeOf(out->dtype()), + 0); +} + +template +void RandintKernel(const Context& ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + DenseTensor* out) { + RandintRawKernel(ctx, low, high, shape, dtype, 0, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + randint_raw, GPU, ALL_LAYOUT, phi::RandintRawKernel, int, int64_t) {} + +PD_REGISTER_KERNEL(randint, GPU, ALL_LAYOUT, phi::RandintKernel, int, int64_t) { +} diff --git a/paddle/phi/kernels/randint_kernel.h b/paddle/phi/kernels/randint_kernel.h new file mode 100644 index 0000000000000..1a78e73d863e3 --- /dev/null +++ b/paddle/phi/kernels/randint_kernel.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void RandintKernel(const Context& ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + DenseTensor* out); + +template +void RandintRawKernel(const Context& ctx, + int low, + int high, + const ScalarArray& shape, + DataType dtype, + int seed, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/ops/compat/randint_sig.cc b/paddle/phi/ops/compat/randint_sig.cc new file mode 100644 index 0000000000000..eb6da78a258bc --- /dev/null +++ b/paddle/phi/ops/compat/randint_sig.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
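For context on the randint kernels above: both sample integers uniformly from [low, high) through std::uniform_int_distribution over the closed range [low, high - 1], and the GPU kernel intentionally draws the numbers with the host engine first and only then copies the finished buffer to device memory, so a fixed seed yields the same sequence as the CPU path. A minimal standalone sketch of that host-generate-then-copy flow (plain CUDA runtime API, error checking omitted):

#include <cuda_runtime.h>
#include <random>
#include <vector>

// Fill `dev_out` (device memory holding at least n ints) with samples
// drawn on the host from the uniform integer distribution over [low, high).
void RandintToDevice(int low, int high, size_t n, unsigned seed, int* dev_out) {
  std::mt19937_64 engine(seed);
  std::uniform_int_distribution<int> dist(low, high - 1);  // bounds are inclusive
  std::vector<int> host(n);
  for (size_t i = 0; i < n; ++i) host[i] = dist(engine);
  cudaMemcpy(dev_out, host.data(), n * sizeof(int), cudaMemcpyHostToDevice);
}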
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature RandintOpArgumentMapping(const ArgumentMappingContext& ctx) { + int seed = paddle::any_cast(ctx.Attr("seed")); + if (seed) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "randint_raw", + {}, + {"low", "high", "ShapeTensorList", "seed", "dtype"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("randint_raw", + {}, + {"low", "high", "ShapeTensor", "seed", "dtype"}, + {"Out"}); + } else { + return KernelSignature("randint_raw", + {}, + {"low", "high", "shape", "seed", "dtype"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "randint", {}, {"low", "high", "ShapeTensorList", "dtype"}, {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature( + "randint", {}, {"low", "high", "ShapeTensor", "dtype"}, {"Out"}); + } else { + return KernelSignature( + "randint", {}, {"low", "high", "shape", "dtype"}, {"Out"}); + } + } + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(randint, phi::RandintOpArgumentMapping); From b695fd958a3b5ea3cbc4ee766d2867839129b4d3 Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Thu, 24 Feb 2022 17:48:37 +0800 Subject: [PATCH 35/85] [phi]migrate increment addmm multinomial cholesky kernels to phi (#39858) * migrate increment addmm multinomial cholesky kernels to phi * test pr39869 * test pr39869 * fix style and ci --- paddle/fluid/operators/addmm_op.cc | 12 +- paddle/fluid/operators/addmm_op.h | 195 --------- paddle/fluid/operators/cholesky_op.cc | 10 +- paddle/fluid/operators/cholesky_op.cu | 169 -------- paddle/fluid/operators/cholesky_op.h | 374 ------------------ paddle/fluid/operators/increment_op.cc | 15 +- paddle/fluid/operators/increment_op.h | 41 -- paddle/fluid/operators/increment_op_npu.cc | 2 +- paddle/fluid/operators/multinomial_op.cc | 28 -- paddle/fluid/operators/multinomial_op.cu | 270 ------------- paddle/phi/kernels/addmm_grad_kernel.h | 33 ++ .../kernels/addmm_kernel.h} | 24 +- paddle/phi/kernels/cholesky_grad_kernel.h | 28 ++ paddle/phi/kernels/cholesky_kernel.h | 27 ++ paddle/phi/kernels/cpu/addmm_grad_kernel.cc | 22 ++ paddle/phi/kernels/cpu/addmm_kernel.cc | 21 + .../phi/kernels/cpu/cholesky_grad_kernel.cc | 22 ++ paddle/phi/kernels/cpu/cholesky_kernel.cc | 81 ++++ paddle/phi/kernels/cpu/increment_kernel.cc | 28 ++ paddle/phi/kernels/cpu/multinomial_kernel.cc | 46 +++ paddle/phi/kernels/gpu/addmm_grad_kernel.cu | 22 ++ paddle/phi/kernels/gpu/addmm_kernel.cu | 21 + .../phi/kernels/gpu/cholesky_grad_kernel.cu | 22 ++ paddle/phi/kernels/gpu/cholesky_kernel.cu | 217 ++++++++++ paddle/phi/kernels/gpu/increment_kernel.cu | 28 ++ paddle/phi/kernels/gpu/multinomial_kernel.cu | 288 ++++++++++++++ .../phi/kernels/impl/addmm_grad_kernel_impl.h | 105 +++++ paddle/phi/kernels/impl/addmm_kernel_impl.h | 121 ++++++ .../kernels/impl/cholesky_grad_kernel_impl.h | 336 ++++++++++++++++ .../phi/kernels/impl/increment_kernel_impl.h | 37 ++ paddle/phi/kernels/increment_kernel.h | 27 ++ .../kernels/multinomial_kernel.h} | 59 ++- paddle/phi/ops/compat/addmm_sig.cc | 35 ++ paddle/phi/ops/compat/cholesky_sig.cc | 34 ++ 34 files changed, 1649 insertions(+), 1151 deletions(-) delete mode 100644 paddle/fluid/operators/addmm_op.h delete mode 100644 paddle/fluid/operators/cholesky_op.cu 
delete mode 100644 paddle/fluid/operators/cholesky_op.h delete mode 100644 paddle/fluid/operators/increment_op.h delete mode 100644 paddle/fluid/operators/multinomial_op.cu create mode 100644 paddle/phi/kernels/addmm_grad_kernel.h rename paddle/{fluid/operators/addmm_op.cu => phi/kernels/addmm_kernel.h} (50%) create mode 100644 paddle/phi/kernels/cholesky_grad_kernel.h create mode 100644 paddle/phi/kernels/cholesky_kernel.h create mode 100644 paddle/phi/kernels/cpu/addmm_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/addmm_kernel.cc create mode 100644 paddle/phi/kernels/cpu/cholesky_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/cholesky_kernel.cc create mode 100644 paddle/phi/kernels/cpu/increment_kernel.cc create mode 100644 paddle/phi/kernels/cpu/multinomial_kernel.cc create mode 100644 paddle/phi/kernels/gpu/addmm_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/addmm_kernel.cu create mode 100644 paddle/phi/kernels/gpu/cholesky_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/cholesky_kernel.cu create mode 100644 paddle/phi/kernels/gpu/increment_kernel.cu create mode 100644 paddle/phi/kernels/gpu/multinomial_kernel.cu create mode 100644 paddle/phi/kernels/impl/addmm_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/addmm_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/increment_kernel_impl.h create mode 100644 paddle/phi/kernels/increment_kernel.h rename paddle/{fluid/operators/multinomial_op.h => phi/kernels/multinomial_kernel.h} (70%) create mode 100644 paddle/phi/ops/compat/addmm_sig.cc create mode 100644 paddle/phi/ops/compat/cholesky_sig.cc diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 915b4daeeb525..863e64c686d7b 100644 --- a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/addmm_op.h" #include #include #include #include +#include "paddle/fluid/framework/op_registry.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -24,6 +24,8 @@ limitations under the License. */ namespace paddle { namespace operators { +constexpr int kMULMKLDNNINT8 = 1; + using framework::OpKernelType; using framework::Tensor; @@ -227,11 +229,3 @@ REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker); REGISTER_OPERATOR(addmm_grad, ops::AddMMGradOp); - -REGISTER_OP_CPU_KERNEL( - addmm, ops::AddMMKernel, - ops::AddMMKernel); - -REGISTER_OP_CPU_KERNEL( - addmm_grad, ops::AddMMGradKernel, - ops::AddMMGradKernel); diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h deleted file mode 100644 index 9d225ba999192..0000000000000 --- a/paddle/fluid/operators/addmm_op.h +++ /dev/null @@ -1,195 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -template -using EigenTensor = framework::EigenTensor; - -using Array1 = Eigen::DSizes; -using Array2 = Eigen::DSizes; - -using Tensor = framework::Tensor; - -constexpr int kMULMKLDNNINT8 = 1; - -template -class AddMMKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* input = context.Input("Input"); - const Tensor* x = context.Input("X"); - const Tensor* y = context.Input("Y"); - - auto input_dims = input->dims(); - auto x_dims = x->dims(); - auto y_dims = y->dims(); - - // broadcast mode check - if (x_dims[0] != input_dims[0]) { - PADDLE_ENFORCE_EQ(input_dims[0], 1, - platform::errors::InvalidArgument( - "When x_dims[0] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[0])); - PADDLE_ENFORCE_EQ( - y_dims[1] == input_dims[1] || input_dims[1] == 1, true, - platform::errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, x_dims, y_dims)); - } - // broadcast mode check - if (y_dims[1] != input_dims[1]) { - PADDLE_ENFORCE_EQ(input_dims[1], 1, - platform::errors::InvalidArgument( - "When y_dims[1] is not equal with input_dims[0], " - "input_dims[0] must be 1 but got %s", - input_dims[1])); - PADDLE_ENFORCE_EQ( - x_dims[0] == input_dims[0] || input_dims[0] == 1, true, - platform::errors::InvalidArgument( - "The input tensor shape mismatch, input shape=[%s], " - "x shape=[%s], y shape=[%s]", - input_dims, x_dims, y_dims)); - } - // broadcast mode check - PADDLE_ENFORCE_EQ( - x_dims[1], y_dims[0], - platform::errors::InvalidArgument( - "The input tensor X's width must be equal with matrix Y' height. 
" - "But received X's shape = [%s], Y's shape = [%s].", - x_dims[1], y_dims[0])); - - auto* out = context.Output("Out"); - out->mutable_data({x_dims[0], y_dims[1]}, context.GetPlace()); - - float alpha = context.template Attr("Alpha"); - float beta = context.template Attr("Beta"); - - auto blas = phi::funcs::GetBlas(context); - - // calc broadcast dim - Array2 bcast_dims; - bcast_dims[0] = x_dims[0] / input_dims[0]; - bcast_dims[1] = y_dims[1] / input_dims[1]; - VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; - // broadcast using eigen - auto eigen_input = EigenTensor::From(*input); - auto eigen_out = EigenTensor::From(*out); - auto& place = - *context.template device_context().eigen_device(); - EigenBroadcast, T, 2>::Eval( - place, eigen_out, eigen_input, bcast_dims); - - blas.GEMM(false, false, x_dims[0], y_dims[1], x_dims[1], alpha, - x->data(), x_dims[1], y->data(), y_dims[1], beta, - out->data(), y_dims[1]); - } -}; - -template -class AddMMGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto in_dims = ctx.Input("Input")->dims(); - auto* dinput = - ctx.Output(framework::GradVarName("Input")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - float alpha = ctx.Attr("Alpha"); - float beta = ctx.Attr("Beta"); - - int total_elems = 0; - - VLOG(3) << "alpha: " << alpha << " beta: " << beta; - - if (dinput != nullptr) { - dinput->set_lod(dout->lod()); - } - if (dx != nullptr) { - dx->set_lod(x->lod()); - } - if (dy != nullptr) { - dy->set_lod(y->lod()); - } - - auto& dev_ctx = ctx.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - if (dinput) { - dinput->mutable_data(ctx.GetPlace()); - total_elems = in_dims[0] * in_dims[1]; - auto& place = - *ctx.template device_context().eigen_device(); - auto eigen_dout = EigenTensor::From(*dout); - auto eigen_dinput = EigenTensor::From(*dinput); - - bool row_compress = in_dims[0] != dout->dims()[0]; - bool col_compress = in_dims[1] != dout->dims()[1]; - auto eigen_dinput_shape = Array2(dinput->dims()[0], dinput->dims()[1]); - - if (row_compress && col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum().eval().reshape(eigen_dinput_shape); - } else if (row_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); - } else if (col_compress) { - eigen_dinput.device(place) = - eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); - } else { - blas.VCOPY(total_elems, dout->data(), dinput->data()); - } - - blas.SCAL(total_elems, beta, dinput->data()); - } - if (dx) { - dx->mutable_data(ctx.GetPlace()); - total_elems = x->dims()[0] * x->dims()[1]; - // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - blas.MatMul(*dout, false, *y, true, dx); - blas.SCAL(total_elems, alpha, dx->data()); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - total_elems = x->dims()[1] * y->dims()[1]; - // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - blas.MatMul(*x, true, *dout, false, dy); - blas.SCAL(total_elems, alpha, dy->data()); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 0902f5b6bc9e8..93dee0df7b954 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/cholesky_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -111,11 +111,3 @@ REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, ops::CholeskyGradOpMaker); REGISTER_OPERATOR(cholesky_grad, ops::CholeskyGradOp); - -REGISTER_OP_CPU_KERNEL(cholesky, ops::CholeskyCPUKernel, - ops::CholeskyCPUKernel); - -REGISTER_OP_CPU_KERNEL( - cholesky_grad, - ops::CholeskyGradKernel, - ops::CholeskyGradKernel); diff --git a/paddle/fluid/operators/cholesky_op.cu b/paddle/fluid/operators/cholesky_op.cu deleted file mode 100644 index 43c16d607c2db..0000000000000 --- a/paddle/fluid/operators/cholesky_op.cu +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// HIP not support cusolver - -#include -#include -#include -#include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/cholesky_op.h" -#include "paddle/fluid/platform/dynload/cusolver.h" - -namespace paddle { -namespace operators { - -template -class CholeskyGPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto& dev_ctx = - context.template device_context(); - - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - bool upper = context.Attr("upper"); - auto& dims = x->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - int m = dims[dims.size() - 1]; - int tensor_size = batch_count * m * m; - - const auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - - // matrices are assumed to be stored in column-major order in cusolver - cublasFillMode_t uplo = - upper ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER; - // portf is inplace, thus copy the triangular part of the input matrices to - // the output and set the other triangular part to 0 firstly - platform::ForRange for_range(dev_ctx, - tensor_size); - if (upper) { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ 0, /* num_upper_diags */ m, x_data, - out_data); - for_range(matrix_band_part_functor); - } else { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, x_data, - out_data); - for_range(matrix_band_part_functor); - } - - auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); - auto* info_ptr = reinterpret_cast(info->ptr()); - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) - if (batch_count > 1) { - std::vector output_ptrs; - for (int i = 0; i < batch_count; i++) { - output_ptrs.emplace_back(out_data + i * m * m); - } - thrust::device_vector dev_output_ptrs(output_ptrs.begin(), - output_ptrs.end()); - PotrfBatched(dev_ctx, uplo, m, - thrust::raw_pointer_cast(dev_output_ptrs.data()), m, - info_ptr, batch_count); - // TODO(guosheng): There seems to a bug in cusolver potrfBatched and need - // to clear the upper triangle of the output. Remove this workaround once - // the bug is fixed. - if (!upper) { - MatrixBandPartFunctor matrix_band_part_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, out_data, - out_data); - for_range(matrix_band_part_functor); - } - } else { -#endif - for (int i = 0; i < batch_count; i++) { - Potrf(dev_ctx, uplo, m, out_data + i * m * m, m, info_ptr + i); - } - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) - } -#endif - // check the info - std::vector error_info; // only for checking positive matrix - error_info.resize(batch_count); - - memory::Copy(platform::CPUPlace(), error_info.data(), dev_ctx.GetPlace(), - info_ptr, sizeof(int) * batch_count, dev_ctx.stream()); - - for (int i = 0; i < batch_count; ++i) { - PADDLE_ENFORCE_EQ(error_info[i], 0, - platform::errors::PreconditionNotMet( - "For batch [%d]: U(%d, %d) is zero, singular U.", i, - error_info[i], error_info[i])); - } - } - - void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, - int n, T* A, int lda, int* info) const; - - void PotrfBatched(const platform::CUDADeviceContext& dev_ctx, - cublasFillMode_t uplo, int n, T* Aarray[], int lda, - int* info_array, int batch_size) const; -}; - -#define FUNC_WITH_TYPES(m) m(float, S) m(double, D) - -#define POTRF_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::Potrf(const platform::CUDADeviceContext& dev_ctx, \ - cublasFillMode_t uplo, int n, T* A, \ - int lda, int* info) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - int workspace_size = 0; \ - PADDLE_ENFORCE_GPU_SUCCESS( \ - platform::dynload::cusolverDn##C##potrf_bufferSize( \ - handle, uplo, n, A, lda, &workspace_size)); \ - auto workspace = memory::Alloc(dev_ctx, workspace_size); \ - T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrf( \ - handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ - } - -FUNC_WITH_TYPES(POTRF_INSTANCE); - -#if CUDA_VERSION >= 9020 && !defined(_WIN32) -#define POTRF_BATCH_INSTANCE(T, C) \ - template <> \ - void CholeskyGPUKernel::PotrfBatched( \ - const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, \ - int n, T* Aarray[], int lda, int* info_array, int batch_size) const { \ - auto handle = dev_ctx.cusolver_dn_handle(); \ - 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cusolverDn##C##potrfBatched( \ - handle, uplo, n, Aarray, lda, info_array, batch_size)); \ - } - -FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); -#endif - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(cholesky, ops::CholeskyGPUKernel, - ops::CholeskyGPUKernel); -REGISTER_OP_CUDA_KERNEL( - cholesky_grad, - ops::CholeskyGradKernel, - ops::CholeskyGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/cholesky_op.h b/paddle/fluid/operators/cholesky_op.h deleted file mode 100644 index 9504909073f79..0000000000000 --- a/paddle/fluid/operators/cholesky_op.h +++ /dev/null @@ -1,374 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include -#include -#include "Eigen/Cholesky" -#include "Eigen/Core" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class CholeskyCPUKernel : public framework::OpKernel { - public: - // different with EigenMatrix in framework/eigen.h - using EigenMatrix = - Eigen::Matrix; - using InputMatrixMap = Eigen::Map; - using OutputMatrixMap = Eigen::Map; - void Compute(const framework::ExecutionContext& context) const override { - const Tensor* x = context.Input("X"); - Tensor* out = context.Output("Out"); - - bool upper = context.Attr("upper"); - auto& dims = x->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - auto m = dims[dims.size() - 1]; - - const auto* x_data = x->data(); - auto* out_data = out->mutable_data(context.GetPlace()); - // Cholesky decomposition for each matrix, maybe can use multi threads - for (int i = 0; i < batch_count; i++) { - auto input = InputMatrixMap(x_data + i * m * m, m, m); - auto output = OutputMatrixMap(out_data + i * m * m, m, m); - if (upper) { - Eigen::LLT< - Eigen::Matrix, - Eigen::UpLoType::Upper> - llt_decomposition(input); - PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The " - "%d-th input matrice " - "might not be not be positive definite.", - i)); - output = llt_decomposition.matrixU(); - } else { - Eigen::LLT< - Eigen::Matrix, - Eigen::UpLoType::Lower> - llt_decomposition(input); - PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success, - platform::errors::InvalidArgument( - "Cholesky decomposition was not successful. The " - "%d-th input matrice " - "might not be not be positive definite.", - i)); - output = llt_decomposition.matrixL(); - } - } - } -}; - -/*! 
Use these functors to implement tril, triu, diagonal and other operators */ -template -struct EyeFunctor { - EyeFunctor(const int m, const int n, T* output) - : m_(m), n_(n), output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int global_row = index / n_; - const int col = index - global_row * n_; - const int batch = global_row / m_; - const int row = global_row - batch * m_; - output_[index] = col == row ? static_cast(1) : static_cast(0); - } - - const int m_, n_; - T* output_; -}; - -template -struct MatrixBandPartFunctor { - /*! Set output as input value outside a central band and 0 inside that band. - * That is: output[i, j, ..., m, n] = in_band(m, n) * input[i, j, ..., m, n] - * where: in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && (num_upper - * < 0 || (n-m) <= num_upper) - */ - MatrixBandPartFunctor(const int m, const int n, const int num_lower_diags, - const int num_upper_diags, const T* input, T* output) - : m_(m), - n_(n), - num_lower_diags_(num_lower_diags), - num_upper_diags_(num_upper_diags), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int col = index % n_; - const int row = (index / n_) % m_; - const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); - const int band_end = - (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); - if (col < band_start || col >= band_end) { - output_[index] = static_cast(0); - } else { - output_[index] = input_[index]; - } - } - - const int m_, n_, num_lower_diags_, num_upper_diags_; - const T* input_; - T* output_; -}; - -template -struct MatrixSetDiagFunctor { - /*! Overwrite specified diagonals of output by the values in diagonal. - * diagonals can be a central band specified by num_diags and - * upper_diag_index, where upper_diag_index=0 refers to the main diagonal, - * positive value means superdiagonal and negative value means subdiagonal. - * When it is a band, `diag` has a shape [i, j, ..., num_diags, max_diag_len] - * and the num_diags diagonals has a up to down layout. Otherwise it has a - * shape [i, j, ..., max_diag_len]. - */ - MatrixSetDiagFunctor(const int m, const int n, const int num_diags, - const int max_diag_len, const int upper_diag_index, - const T* diag, T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - diag_(diag), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_diag_index * max_diag_len_; - const int batch = batch_and_diag_index / num_diags_; - const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - diag_index_in_input; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - - // Upper-bound checks for diagonals shorter than max_diag_len. - // y_index and x_index are nonnegative by construction. 
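The per-matrix factorization in the CholeskyCPUKernel above reduces to Eigen's LLT solver; a self-contained usage sketch of that API on a small symmetric positive-definite matrix (lower-triangular factor, i.e. the upper == false branch):

#include <iostream>
#include "Eigen/Cholesky"
#include "Eigen/Core"

int main() {
  Eigen::Matrix3d a;
  a << 4, 2, 2,
       2, 5, 3,
       2, 3, 6;                          // symmetric positive definite
  Eigen::LLT<Eigen::Matrix3d> llt(a);
  if (llt.info() != Eigen::Success) {    // fails if a is not SPD
    std::cerr << "decomposition failed\n";
    return 1;
  }
  Eigen::Matrix3d l = llt.matrixL();     // a == l * l.transpose()
  std::cout << (l * l.transpose() - a).norm() << "\n";  // ~0
  return 0;
}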
- if (y_index < m_ && x_index < n_) { - const int out_index = batch * m_ * n_ + y_index * n_ + x_index; - output_[out_index] = diag_[index]; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T* diag_; - T* output_; -}; - -template -struct MatrixDiagPartFunctor { - /*! Similar to MatrixSetDiagFunctor but return the diagonals. diag_index=0 - * refers to the main diagonal, positive value means superdiagonal and - * negative value means subdiagonal */ - MatrixDiagPartFunctor(const int m, const int n, const int num_diags, - const int max_diag_len, const int upper_diag_index, - const T padding, const T* input, T* output) - : m_(m), - n_(n), - num_diags_(num_diags), - max_diag_len_(max_diag_len), - upper_diag_index_(upper_diag_index), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int batch_and_mapped_diag_index = index / max_diag_len_; - const int index_in_the_diagonal = - index - batch_and_mapped_diag_index * max_diag_len_; - const int batch = batch_and_mapped_diag_index / num_diags_; - const int mapped_diag_index = - batch_and_mapped_diag_index - batch * num_diags_; - // diag_index=0 refers to the main diagonal - const int diag_index = upper_diag_index_ - mapped_diag_index; - // shift down for subdiagonal if diag_index < 0 - const int y_index = - index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); - // shift right for superdiagonal if diag_index > 0 - const int x_index = - index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); - if (y_index < m_ && x_index < n_) { - output_[index] = input_[batch * m_ * n_ + y_index * m_ + x_index]; - } else { - output_[index] = padding_; - } - } - - const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; - const T padding_; - const T* input_; - T* output_; -}; - -template -struct MatrixBandPartScaleEndFunctor { - /*! Compared with MatrixBandPartFunctor, it scale up values at the end of - * band. It can be used to fuse the following operations, which actually - * output triangular with diagonal scaled up: - * 1. dig = matrix_diag_part(middle) - * 2. middle = matrix_set_diag(middle, diag * scalar) - * 3. middle = matrix_band_part(middle, -1, 0) - */ - MatrixBandPartScaleEndFunctor(const int m, const int n, - const int num_lower_diags, - const int num_upper_diags, const T scale, - const T* input, T* output) - : m_(m), - n_(n), - num_lower_diags_(num_lower_diags), - num_upper_diags_(num_upper_diags), - scale_(scale), - input_(input), - output_(output) {} - - HOSTDEVICE void operator()(size_t index) const { - const int col = index % n_; - const int row = (index / n_) % m_; - const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); - const int band_end = - (num_upper_diags_ < 0 ? 
n_ : row + num_upper_diags_ + 1); - if (col < band_start || col >= band_end) { - output_[index] = 0; - } else if (col == band_end - 1) { - output_[index] = scale_ * input_[index]; - } else { - output_[index] = input_[index]; - } - } - - const int m_, n_, num_lower_diags_, num_upper_diags_; - const T scale_; - const T* input_; - T* output_; -}; - -template -struct AddtoScaleFunctor { - AddtoScaleFunctor(const T scale, const T* input, T* output) - : scale_(scale), input_(input), output_(output) {} - HOSTDEVICE void operator()(size_t index) const { - output_[index] += input_[index]; - output_[index] *= scale_; - } - const T scale_; - const T* input_; - T* output_; -}; - -template -class CholeskyGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Input("Out"); - auto* out_grad = context.Input(framework::GradVarName("Out")); - auto* x_grad = context.Output(framework::GradVarName("X")); - auto* x_grad_data = x_grad->mutable_data(context.GetPlace()); - - bool upper = context.Attr("upper"); - auto& dims = out->dims(); - int batch_count = 1; - for (int i = 0; i < dims.size() - 2; i++) { - batch_count *= dims[i]; - } - auto m = dims[dims.size() - 1]; - int tensor_size = batch_count * m * m; - - auto& dev_ctx = context.template device_context(); - - std::vector axis(dims.size() - 2); - std::iota(axis.begin(), axis.end(), 0); - axis.insert(axis.end(), {dims.size() - 1, dims.size() - 2}); - Tensor l, l_grad; - if (upper) { - l.mutable_data(dims, context.GetPlace()); - l_grad.mutable_data(dims, context.GetPlace()); - TransCompute(dims.size(), dev_ctx, *out, &l, axis); - TransCompute(dims.size(), dev_ctx, *out_grad, &l_grad, - axis); - } else { - l = *out; - l_grad = *out_grad; - } - auto* l_data = l.data(); - - /*! refer to Iain Murray (2016); arXiv 1602.07527 */ - /*! phi = matmul(L.transpose(-1, -2), grad) */ - Tensor middle; - auto* middle_data = middle.mutable_data(dims, context.GetPlace()); - auto trans_desc = phi::funcs::CreateMatrixDescriptor(dims, 0, true); - auto no_trans_desc = phi::funcs::CreateMatrixDescriptor(dims, 0, false); - auto blas = phi::funcs::GetBlas(context); - blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0)); - - /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */ - platform::ForRange for_range(dev_ctx, tensor_size); - MatrixBandPartScaleEndFunctor matrix_band_part_scale_end_functor( - m, m, /* num_lower_diags */ m, /* num_upper_diags */ 0, - /* scale */ 0.5, middle_data, middle_data); - for_range(matrix_band_part_scale_end_functor); - - // Compute inverse by solving the triangular linear system AX = B, where B - // is the identity matrix. The matrix X would be overwritten on B - Tensor identity; - auto* identity_data = identity.mutable_data(dims, context.GetPlace()); - EyeFunctor eye_functor(m, m, identity_data); - for_range(eye_functor); - // TODO(guosheng): use trsmBatched for GPU - for (int i = 0; i < batch_count; i++) { - blas.TRSM(/*side*/ CblasLeft, /*uplo*/ CblasLower, - /*trans*/ CblasNoTrans, /*diag*/ CblasNonUnit, /*m*/ m, /*n*/ m, - /*alpha*/ T(1), l_data + i * m * m, /*lda*/ m, - identity_data + i * m * m, /*ldb*/ m); - } - Tensor& l_inverse = identity; - - /*! 
x_grad = matmul(matmul(L_inverse.transpose(-1, -2), phi), L_inverse) */ - Tensor middle1; - middle1.mutable_data(dims, context.GetPlace()); - blas.MatMul(l_inverse, trans_desc, middle, no_trans_desc, T(1), &middle1, - T(0)); - blas.MatMul(middle1, no_trans_desc, l_inverse, no_trans_desc, T(1), x_grad, - T(0)); - - /*! x_grad.add(x_grad.transpose(-1, -2)).mul_(0.5) */ - Tensor x_grad_trans; - auto* x_grad_trans_data = - x_grad_trans.mutable_data(dims, context.GetPlace()); - TransCompute(dims.size(), dev_ctx, *x_grad, &x_grad_trans, - axis); - AddtoScaleFunctor addto_scale_functor(0.5, x_grad_trans_data, - x_grad_data); - for_range(addto_scale_functor); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index c572870d950a8..3d8e80bfaeb8f 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -12,9 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/increment_op.h" - -#include +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace framework { @@ -101,14 +99,3 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, ops::IncrementGradOpMaker); -REGISTER_OP_CPU_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); - -REGISTER_OP_CUDA_KERNEL( - increment, ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel, - ops::IncrementKernel); diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h deleted file mode 100644 index 4b9d07146484f..0000000000000 --- a/paddle/fluid/operators/increment_op.h +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace paddle { -namespace operators { - -template -class IncrementKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x_tensor = context.Input("X"); - auto* out_tensor = context.Output("Out"); - float step = context.Attr("step"); - - out_tensor->mutable_data(context.GetPlace()); - auto& dev = - *context.template device_context().eigen_device(); - EigenAdd, T>::Eval( - dev, framework::EigenScalar::From(*out_tensor), - framework::EigenScalar::From(*x_tensor), static_cast(step)); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index 1c7c8a19110bc..16f1b3b126995 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/increment_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 02479222747df..00eaa2f8e77cf 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/multinomial_op.h" #include #include @@ -80,29 +79,6 @@ class MultinomialOp : public framework::OperatorWithKernel { } }; -template -class MultinomialOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto *in_data = x->data(); - int64_t *out_data = out->mutable_data(ctx.GetPlace()); - - auto in_dims = x->dims(); - int64_t in_rank = in_dims.size(); - const int64_t num_categories = in_dims[in_rank - 1]; - const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; - - MultinomialFunctor(out_data, in_data, num_samples, replacement, - num_categories, num_distributions); - } -}; - } // namespace operators } // namespace paddle @@ -112,7 +88,3 @@ REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL( - multinomial, ops::MultinomialOpKernel, - ops::MultinomialOpKernel); diff --git a/paddle/fluid/operators/multinomial_op.cu b/paddle/fluid/operators/multinomial_op.cu deleted file mode 100644 index a07cae8d3dabc..0000000000000 --- a/paddle/fluid/operators/multinomial_op.cu +++ /dev/null @@ -1,270 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifndef PADDLE_WITH_HIP -// To-do(qili93): fix this after issue resolved -// https://github.com/ROCmSoftwarePlatform/rocPRIM/issues/202 - -#include -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/multinomial_op.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -template -__global__ void NormalizeProbability(T* norm_probs, const T* in_data, - T* sum_rows, int64_t num_distributions, - int64_t num_categories) { - int id = threadIdx.x + blockIdx.x * blockDim.x + - blockIdx.y * gridDim.x * blockDim.x; - if (id < num_distributions * num_categories) { - PADDLE_ENFORCE( - in_data[id] >= 0.0, - "The input of multinomial distribution should be >= 0, but got %f.", - in_data[id]); - int64_t row_id = id / num_categories; - PADDLE_ENFORCE(sum_rows[row_id] > 0.0, - "The sum of one multinomial distribution probability should " - "be > 0, but got %f.", - sum_rows[row_id]); - norm_probs[id] = in_data[id] / sum_rows[row_id]; - } -} - -template -__global__ void GetCumulativeProbs(T* norm_probs_data, - int64_t num_distributions, - int64_t num_categories, - T* cumulative_probs) { - int id = blockIdx.x; - thrust::inclusive_scan(thrust::device, norm_probs_data + id * num_categories, - norm_probs_data + (id + 1) * num_categories, - cumulative_probs + id * num_categories); -} - -template -struct RandomGeneratorCudaFunctor { - unsigned int seed_; - __host__ __device__ RandomGeneratorCudaFunctor(int seed) : seed_(seed) {} - - __host__ __device__ T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(0.0, 1.0); - rng.discard(n); - return dist(rng); - } -}; - -template -__device__ int binarySearchFunctor(T* cumulative_probs, T* norm_probs_data, - int num_categories, T rng_number) { - int left = 0; - int right = num_categories; - - while (right - left > 0) { - int mid = left + (right - left) / 2; - - T temp_prob = cumulative_probs[mid]; - if (temp_prob < rng_number) { - left = mid + 1; - } else { - right = mid; - } - } - - if (left == num_categories) { - left = num_categories - 1; - } - - while (left >= 1 && norm_probs_data[left] == 0) left--; - - return left; -} - -template -__global__ void sampleMultinomialWithReplacement( - T* rng_data, const int64_t num_samples, int64_t* out_data, - const int64_t num_distributions, const int64_t num_categories, - T* cumulative_probs, T* norm_probs_data) { - // use binary search to get the selected category sample id. - // let cumulative_probs[id-1] < rng_data < cumulative_probs[id]. 
- - // for every distribution - int dist = blockIdx.y; - // for every sample - int sample = blockIdx.x * blockDim.x + threadIdx.x; - if (sample < num_samples) { - T rng_number = rng_data[sample + dist * num_samples]; - - // Find the bucket that a uniform random number lies in - int selected_category = binarySearchFunctor( - cumulative_probs + dist * num_categories, - norm_probs_data + dist * num_categories, num_categories, rng_number); - - out_data[sample + dist * num_samples] = selected_category; - } -} - -template -class MultinomialOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - - const int64_t num_samples = ctx.Attr("num_samples"); - const bool replacement = ctx.Attr("replacement"); - - auto* in_data = x->data(); - int64_t* out_data = out->mutable_data(ctx.GetPlace()); - - auto in_dims = x->dims(); - int64_t in_rank = in_dims.size(); - const int64_t num_categories = in_dims[in_rank - 1]; - const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; - - // If replacement is False, it's not a replaceable sample. Every category - // can - // be used only once. So after every sample, probability of the distribution - // will change. The implementation can't be parallelizable. Thus, call CPU - // implementation ``MultinomialFunctor`` to sample the distribution. - if (!replacement) { - int64_t in_data_numel = x->numel(); - int64_t out_data_numel = out->numel(); - - T* cpu_in_data = new T[in_data_numel]; - int64_t* cpu_out_data = new int64_t[out_data_numel]; - -#ifdef PADDLE_WITH_HIP - hipMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T), - hipMemcpyDeviceToHost); -#else - cudaMemcpy(cpu_in_data, in_data, in_data_numel * sizeof(T), - cudaMemcpyDeviceToHost); -#endif - - MultinomialFunctor(cpu_out_data, cpu_in_data, num_samples, replacement, - num_categories, num_distributions); - -#ifdef PADDLE_WITH_HIP - hipMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(int64_t), - hipMemcpyHostToDevice); -#else - cudaMemcpy(out_data, cpu_out_data, out_data_numel * sizeof(int64_t), - cudaMemcpyHostToDevice); -#endif - - delete[] cpu_in_data; - delete[] cpu_out_data; - return; - } - - // Sum of input may not be 1. To get probability in range [0, 1], calculate - // sum of each row of input, and then use the sum to normalize the input. - // sum_row_data: sum of each row - framework::Tensor sum_rows_tensor; - auto* sum_rows_data = - sum_rows_tensor.mutable_data({num_distributions}, ctx.GetPlace()); - - auto& place = *ctx.template device_context() - .eigen_device(); - - if (num_distributions == 1) { - auto eigen_input = framework::EigenVector::Flatten(*x); - auto eigen_sum_rows = framework::EigenVector::Flatten(sum_rows_tensor); - eigen_sum_rows.device(place) = - eigen_input.sum(Eigen::DSizes(1)) - .eval() - .reshape(Eigen::DSizes(sum_rows_tensor.dims()[0])); - } else { - auto eigen_input = framework::EigenMatrix::From(*x); - auto eigen_sum_rows = framework::EigenVector::Flatten(sum_rows_tensor); - eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes(1)); - } - - // Normalize row of each distribution to get the probability in range [0, - // 1]. 
- // norm_probs_data: probability of the distribution - framework::Tensor norm_probs_tensor; - auto* norm_probs_data = norm_probs_tensor.mutable_data( - {num_distributions, num_categories}, ctx.GetPlace()); - - // number of threads in a block is min(num_categories, 512) - dim3 block_norm(num_categories < 512 ? num_categories : 512); - dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1); - NormalizeProbability< - T><<>>( - norm_probs_data, in_data, sum_rows_data, num_distributions, - num_categories); - - // Get cumulative probability of each distribution. It's the same function - // of - // ``cumsum`` op. - framework::Tensor cumulative_probs_tensor; - auto* cumulative_probs = cumulative_probs_tensor.mutable_data( - {num_distributions, num_categories}, ctx.GetPlace()); - dim3 block_cumsum(1); - dim3 grid_cumsum(num_distributions); - GetCumulativeProbs<<>>( - norm_probs_data, num_distributions, num_categories, cumulative_probs); - - // Generate random number for each sample. - std::random_device rd; - auto seed = rd(); - - framework::Tensor rng_data_tensor; - auto* rng_data = rng_data_tensor.mutable_data( - {num_distributions, num_samples}, ctx.GetPlace()); - - thrust::counting_iterator index_sequence_begin(0); - platform::Transform trans; - auto* context = - static_cast(&ctx.device_context()); - trans(*context, index_sequence_begin, - index_sequence_begin + num_distributions * num_samples, rng_data, - RandomGeneratorCudaFunctor(seed)); - - // Sample the multinomial distributions. - dim3 block_sample(128); - dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions); - sampleMultinomialWithReplacement<<>>( - rng_data, num_samples, out_data, num_distributions, num_categories, - cumulative_probs, norm_probs_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - multinomial, ops::MultinomialOpKernel, - ops::MultinomialOpKernel); - -#endif diff --git a/paddle/phi/kernels/addmm_grad_kernel.h b/paddle/phi/kernels/addmm_grad_kernel.h new file mode 100644 index 0000000000000..0d2f445a61de0 --- /dev/null +++ b/paddle/phi/kernels/addmm_grad_kernel.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void AddmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/fluid/operators/addmm_op.cu b/paddle/phi/kernels/addmm_kernel.h similarity index 50% rename from paddle/fluid/operators/addmm_op.cu rename to paddle/phi/kernels/addmm_kernel.h index e42d9c84f9234..3674305796cde 100644 --- a/paddle/fluid/operators/addmm_op.cu +++ b/paddle/phi/kernels/addmm_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,13 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/addmm_op.h" +#pragma once -namespace ops = paddle::operators; -namespace plat = paddle::platform; +#include "paddle/phi/core/dense_tensor.h" -REGISTER_OP_CUDA_KERNEL(addmm, ops::AddMMKernel, - ops::AddMMKernel); -REGISTER_OP_CUDA_KERNEL(addmm_grad, - ops::AddMMGradKernel, - ops::AddMMGradKernel); +namespace phi { + +template +void AddmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float alpha, + float beta, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cholesky_grad_kernel.h b/paddle/phi/kernels/cholesky_grad_kernel.h new file mode 100644 index 0000000000000..3fb532d9af7f9 --- /dev/null +++ b/paddle/phi/kernels/cholesky_grad_kernel.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CholeskyGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + bool upper, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/cholesky_kernel.h b/paddle/phi/kernels/cholesky_kernel.h new file mode 100644 index 0000000000000..5dc1473d8dbca --- /dev/null +++ b/paddle/phi/kernels/cholesky_kernel.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void CholeskyKernel(const Context& dev_ctx, + const DenseTensor& x, + bool upper, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/addmm_grad_kernel.cc b/paddle/phi/kernels/cpu/addmm_grad_kernel.cc new file mode 100644 index 0000000000000..6032f15e0f75e --- /dev/null +++ b/paddle/phi/kernels/cpu/addmm_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + addmm_grad, CPU, ALL_LAYOUT, phi::AddmmGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/addmm_kernel.cc b/paddle/phi/kernels/cpu/addmm_kernel.cc new file mode 100644 index 0000000000000..ff86b655ed3ef --- /dev/null +++ b/paddle/phi/kernels/cpu/addmm_kernel.cc @@ -0,0 +1,21 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_kernel_impl.h" + +PD_REGISTER_KERNEL(addmm, CPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc b/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc new file mode 100644 index 0000000000000..ad9d51db4921e --- /dev/null +++ b/paddle/phi/kernels/cpu/cholesky_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
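For reference, the cholesky_grad backward rule registered below (and implemented in paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h later in this patch, which cites Iain Murray (2016), arXiv 1602.07527) can be summarised, for the lower-triangular case with A = L L^T and incoming gradient \bar{L}, as

    \bar{A} = \tfrac{1}{2}\,(S + S^{\mathsf T}), \qquad S = L^{-\mathsf T}\,\Phi(L^{\mathsf T}\bar{L})\,L^{-1},

where \Phi(X) keeps the lower triangle of X and halves its diagonal. In the implementation, \Phi corresponds to MatrixBandPartScaleEndFunctor, L^{-1} is obtained by solving L X = I with blas.TRSM, and the final symmetrisation is done by AddtoScaleFunctor; the upper-triangular case is reduced to the lower one by transposing out and out_grad first.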
+
+#include "paddle/phi/kernels/cholesky_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(
+    cholesky_grad, CPU, ALL_LAYOUT, phi::CholeskyGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/cholesky_kernel.cc b/paddle/phi/kernels/cpu/cholesky_kernel.cc
new file mode 100644
index 0000000000000..3d9b6b52d75d6
--- /dev/null
+++ b/paddle/phi/kernels/cpu/cholesky_kernel.cc
@@ -0,0 +1,81 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/cholesky_kernel.h"
+
+#include "Eigen/Cholesky"
+#include "Eigen/Core"
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+namespace phi {
+
+template 
+void CholeskyKernel(const Context& dev_ctx,
+                    const DenseTensor& x,
+                    bool upper,
+                    DenseTensor* out) {
+  using EigenMatrix =
+      Eigen::Matrix;
+  using InputMatrixMap = Eigen::Map;
+  using OutputMatrixMap = Eigen::Map;
+
+  auto& dims = x.dims();
+  int batch_count = 1;
+  for (int i = 0; i < dims.size() - 2; i++) {
+    batch_count *= dims[i];
+  }
+  auto m = dims[dims.size() - 1];
+
+  const auto* x_data = x.data();
+  auto* out_data = dev_ctx.template Alloc(out);
+  // Compute the Cholesky decomposition for each matrix; this loop could
+  // potentially use multiple threads.
+  for (int i = 0; i < batch_count; i++) {
+    auto input = InputMatrixMap(x_data + i * m * m, m, m);
+    auto output = OutputMatrixMap(out_data + i * m * m, m, m);
+    if (upper) {
+      Eigen::LLT<
+          Eigen::Matrix,
+          Eigen::UpLoType::Upper>
+          llt_decomposition(input);
+      PADDLE_ENFORCE_EQ(llt_decomposition.info(),
+                        Eigen::Success,
+                        errors::InvalidArgument(
+                            "Cholesky decomposition was not successful. The "
+                            "%d-th input matrix "
+                            "may not be positive definite.",
+                            i));
+      output = llt_decomposition.matrixU();
+    } else {
+      Eigen::LLT<
+          Eigen::Matrix,
+          Eigen::UpLoType::Lower>
+          llt_decomposition(input);
+      PADDLE_ENFORCE_EQ(llt_decomposition.info(),
+                        Eigen::Success,
+                        errors::InvalidArgument(
+                            "Cholesky decomposition was not successful. The "
+                            "%d-th input matrix "
+                            "may not be positive definite.",
+                            i));
+      output = llt_decomposition.matrixL();
+    }
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(
+    cholesky, CPU, ALL_LAYOUT, phi::CholeskyKernel, float, double) {}
diff --git a/paddle/phi/kernels/cpu/increment_kernel.cc b/paddle/phi/kernels/cpu/increment_kernel.cc
new file mode 100644
index 0000000000000..70c178d25a10a
--- /dev/null
+++ b/paddle/phi/kernels/cpu/increment_kernel.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/increment_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/increment_kernel_impl.h" + +PD_REGISTER_KERNEL(increment, + CPU, + ALL_LAYOUT, + phi::IncrementKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/multinomial_kernel.cc b/paddle/phi/kernels/cpu/multinomial_kernel.cc new file mode 100644 index 0000000000000..67e7d5bb68c61 --- /dev/null +++ b/paddle/phi/kernels/cpu/multinomial_kernel.cc @@ -0,0 +1,46 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/multinomial_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void MultinomialKernel(const Context& dev_ctx, + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out) { + auto* in_data = x.data(); + int64_t* out_data = dev_ctx.template Alloc(out); + auto in_dims = x.dims(); + int64_t in_rank = in_dims.size(); + const int64_t num_categories = in_dims[in_rank - 1]; + const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; + + MultinomialFunctor(out_data, + in_data, + num_samples, + replacement, + num_categories, + num_distributions); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + multinomial, CPU, ALL_LAYOUT, phi::MultinomialKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/addmm_grad_kernel.cu b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu new file mode 100644 index 0000000000000..65978da1374e4 --- /dev/null +++ b/paddle/phi/kernels/gpu/addmm_grad_kernel.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/addmm_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + addmm_grad, GPU, ALL_LAYOUT, phi::AddmmGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/addmm_kernel.cu b/paddle/phi/kernels/gpu/addmm_kernel.cu new file mode 100644 index 0000000000000..7b589ce20acca --- /dev/null +++ b/paddle/phi/kernels/gpu/addmm_kernel.cu @@ -0,0 +1,21 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/addmm_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/addmm_kernel_impl.h" + +PD_REGISTER_KERNEL(addmm, GPU, ALL_LAYOUT, phi::AddmmKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu b/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu new file mode 100644 index 0000000000000..9165e8ea4147f --- /dev/null +++ b/paddle/phi/kernels/gpu/cholesky_grad_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cholesky_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + cholesky_grad, GPU, ALL_LAYOUT, phi::CholeskyGradKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/cholesky_kernel.cu b/paddle/phi/kernels/gpu/cholesky_kernel.cu new file mode 100644 index 0000000000000..22ea87d83e8db --- /dev/null +++ b/paddle/phi/kernels/gpu/cholesky_kernel.cu @@ -0,0 +1,217 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include "paddle/phi/kernels/cholesky_kernel.h" + +#include +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/backends/dynload/cusolver.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +struct MatrixBandPartFunctor { + /*! Set output as input value outside a central band and 0 inside that band. + * That is: output[i, j, ..., m, n] = in_band(m, n) * input[i, j, ..., m, n] + * where: in_band(m, n) = (num_lower < 0 || (m-n) <= num_lower)) && (num_upper + * < 0 || (n-m) <= num_upper) + */ + MatrixBandPartFunctor(const int m, + const int n, + const int num_lower_diags, + const int num_upper_diags, + const T* input, + T* output) + : m_(m), + n_(n), + num_lower_diags_(num_lower_diags), + num_upper_diags_(num_upper_diags), + input_(input), + output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int col = index % n_; + const int row = (index / n_) % m_; + const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_); + const int band_end = + (num_upper_diags_ < 0 ? n_ : row + num_upper_diags_ + 1); + if (col < band_start || col >= band_end) { + output_[index] = static_cast(0); + } else { + output_[index] = input_[index]; + } + } + + const int m_, n_, num_lower_diags_, num_upper_diags_; + const T* input_; + T* output_; +}; + +#define FUNC_WITH_TYPES(m) m(float, S) m(double, D) + +#define POTRF_INSTANCE(T, C) \ + void Potrf(const GPUContext& dev_ctx, \ + cublasFillMode_t uplo, \ + int n, \ + T* A, \ + int lda, \ + int* info) { \ + auto handle = dev_ctx.cusolver_dn_handle(); \ + int workspace_size = 0; \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf_bufferSize( \ + handle, uplo, n, A, lda, &workspace_size)); \ + auto workspace = paddle::memory::Alloc(dev_ctx, workspace_size); \ + T* workspace_ptr = reinterpret_cast(workspace->ptr()); \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrf( \ + handle, uplo, n, A, lda, workspace_ptr, workspace_size, info)); \ + } + +FUNC_WITH_TYPES(POTRF_INSTANCE); + +#if CUDA_VERSION >= 9020 && !defined(_WIN32) +#define POTRF_BATCH_INSTANCE(T, C) \ + void PotrfBatched(const GPUContext& dev_ctx, \ + cublasFillMode_t uplo, \ + int n, \ + T* Aarray[], \ + int lda, \ + int* info_array, \ + int batch_size) { \ + auto handle = dev_ctx.cusolver_dn_handle(); \ + PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDn##C##potrfBatched( \ + handle, uplo, n, Aarray, lda, info_array, batch_size)); \ + } + +FUNC_WITH_TYPES(POTRF_BATCH_INSTANCE); +#endif + +template +void CholeskyKernel(const Context& dev_ctx, + const DenseTensor& x, + bool upper, + DenseTensor* out) { + auto& dims = x.dims(); + int batch_count = 1; + for (int i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + int m = dims[dims.size() - 1]; + int tensor_size = batch_count * m * m; + + const auto* x_data = x.data(); + auto* out_data = dev_ctx.template Alloc(out); + + // matrices are assumed to be stored in column-major order in cusolver + cublasFillMode_t uplo = + upper ? 
CUBLAS_FILL_MODE_LOWER : CUBLAS_FILL_MODE_UPPER;
+  // potrf is in-place, thus copy the triangular part of the input matrices to
+  // the output and set the other triangular part to 0 first
+  paddle::platform::ForRange for_range(dev_ctx, tensor_size);
+  if (upper) {
+    MatrixBandPartFunctor matrix_band_part_functor(m,
+                                                   m,
+                                                   /* num_lower_diags */ 0,
+                                                   /* num_upper_diags */ m,
+                                                   x_data,
+                                                   out_data);
+    for_range(matrix_band_part_functor);
+  } else {
+    MatrixBandPartFunctor matrix_band_part_functor(m,
+                                                   m,
+                                                   /* num_lower_diags */ m,
+                                                   /* num_upper_diags */ 0,
+                                                   x_data,
+                                                   out_data);
+    for_range(matrix_band_part_functor);
+  }
+
+  auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batch_count);
+  auto* info_ptr = reinterpret_cast(info->ptr());
+
+#if CUDA_VERSION >= 9020 && !defined(_WIN32)
+  if (batch_count > 1) {
+    std::vector output_ptrs;
+    for (int i = 0; i < batch_count; i++) {
+      output_ptrs.emplace_back(out_data + i * m * m);
+    }
+    thrust::device_vector dev_output_ptrs(output_ptrs.begin(),
+                                          output_ptrs.end());
+    PotrfBatched(dev_ctx,
+                 uplo,
+                 m,
+                 thrust::raw_pointer_cast(dev_output_ptrs.data()),
+                 m,
+                 info_ptr,
+                 batch_count);
+    // TODO(guosheng): There seems to be a bug in cusolver potrfBatched which
+    // requires clearing the upper triangle of the output. Remove this
+    // workaround once the bug is fixed.
+    if (!upper) {
+      MatrixBandPartFunctor matrix_band_part_functor(m,
+                                                     m,
+                                                     /* num_lower_diags */ m,
+                                                     /* num_upper_diags */ 0,
+                                                     out_data,
+                                                     out_data);
+      for_range(matrix_band_part_functor);
+    }
+  } else {
+#endif
+    for (int i = 0; i < batch_count; i++) {
+      Potrf(dev_ctx, uplo, m, out_data + i * m * m, m, info_ptr + i);
+    }
+
+#if CUDA_VERSION >= 9020 && !defined(_WIN32)
+  }
+#endif
+  // check the info
+  std::vector error_info;  // only for checking positive definiteness
+  error_info.resize(batch_count);
+
+  paddle::memory::Copy(CPUPlace(),
+                       error_info.data(),
+                       dev_ctx.GetPlace(),
+                       info_ptr,
+                       sizeof(int) * batch_count,
+                       dev_ctx.stream());
+
+  for (int i = 0; i < batch_count; ++i) {
+    PADDLE_ENFORCE_EQ(error_info[i],
+                      0,
+                      errors::PreconditionNotMet(
+                          "For batch [%d]: U(%d, %d) is zero, singular U.",
+                          i,
+                          error_info[i],
+                          error_info[i]));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(cholesky,  // cuda_only
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::CholeskyKernel,
+                   float,
+                   double) {}
+
+#endif  // not PADDLE_WITH_HIP
diff --git a/paddle/phi/kernels/gpu/increment_kernel.cu b/paddle/phi/kernels/gpu/increment_kernel.cu
new file mode 100644
index 0000000000000..b3c3127191148
--- /dev/null
+++ b/paddle/phi/kernels/gpu/increment_kernel.cu
@@ -0,0 +1,28 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#include "paddle/phi/kernels/increment_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/increment_kernel_impl.h" + +PD_REGISTER_KERNEL(increment, + GPU, + ALL_LAYOUT, + phi::IncrementKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu new file mode 100644 index 0000000000000..ea1cf361958aa --- /dev/null +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -0,0 +1,288 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_WITH_HIP +// To-do(qili93): fix this after issue resolved +// https://github.com/ROCmSoftwarePlatform/rocPRIM/issues/202 + +#include "paddle/phi/kernels/multinomial_kernel.h" + +#include +#include +#include +#include + +#include "paddle/fluid/platform/transform.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +__global__ void NormalizeProbability(T* norm_probs, + const T* in_data, + T* sum_rows, + int64_t num_distributions, + int64_t num_categories) { + int id = threadIdx.x + blockIdx.x * blockDim.x + + blockIdx.y * gridDim.x * blockDim.x; + if (id < num_distributions * num_categories) { + PADDLE_ENFORCE( + in_data[id] >= 0.0, + "The input of multinomial distribution should be >= 0, but got %f.", + in_data[id]); + int64_t row_id = id / num_categories; + PADDLE_ENFORCE(sum_rows[row_id] > 0.0, + "The sum of one multinomial distribution probability should " + "be > 0, but got %f.", + sum_rows[row_id]); + norm_probs[id] = in_data[id] / sum_rows[row_id]; + } +} + +template +__global__ void GetCumulativeProbs(T* norm_probs_data, + int64_t num_distributions, + int64_t num_categories, + T* cumulative_probs) { + int id = blockIdx.x; + thrust::inclusive_scan(thrust::device, + norm_probs_data + id * num_categories, + norm_probs_data + (id + 1) * num_categories, + cumulative_probs + id * num_categories); +} + +template +struct RandomGeneratorCudaFunctor { + unsigned int seed_; + __host__ __device__ RandomGeneratorCudaFunctor(int seed) : seed_(seed) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(0.0, 1.0); + rng.discard(n); + return dist(rng); + } +}; + +template +__device__ int binarySearchFunctor(T* cumulative_probs, + T* norm_probs_data, + int num_categories, + T rng_number) { + int left = 0; + int right = num_categories; + + while (right - left > 0) { + int mid = left + (right - left) / 2; + + T temp_prob = cumulative_probs[mid]; + if (temp_prob < rng_number) { + left = mid + 1; + } else { + right = mid; + } + } + + if (left == num_categories) { + left = num_categories - 1; + } + + while (left >= 1 && norm_probs_data[left] == 0) left--; + + return left; +} + +template +__global__ void 
sampleMultinomialWithReplacement( + T* rng_data, + const int64_t num_samples, + int64_t* out_data, + const int64_t num_distributions, + const int64_t num_categories, + T* cumulative_probs, + T* norm_probs_data) { + // use binary search to get the selected category sample id. + // let cumulative_probs[id-1] < rng_data < cumulative_probs[id]. + + // for every distribution + int dist = blockIdx.y; + // for every sample + int sample = blockIdx.x * blockDim.x + threadIdx.x; + if (sample < num_samples) { + T rng_number = rng_data[sample + dist * num_samples]; + + // Find the bucket that a uniform random number lies in + int selected_category = + binarySearchFunctor(cumulative_probs + dist * num_categories, + norm_probs_data + dist * num_categories, + num_categories, + rng_number); + + out_data[sample + dist * num_samples] = selected_category; + } +} + +template +void MultinomialKernel(const Context& dev_ctx, + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out) { + auto* in_data = x.data(); + int64_t* out_data = dev_ctx.template Alloc(out); + + auto in_dims = x.dims(); + int64_t in_rank = in_dims.size(); + const int64_t num_categories = in_dims[in_rank - 1]; + const int64_t num_distributions = in_rank > 1 ? in_dims[in_rank - 2] : 1; + + // If replacement is False, it's not a replaceable sample. Every category + // can + // be used only once. So after every sample, probability of the distribution + // will change. The implementation can't be parallelizable. Thus, call CPU + // implementation ``MultinomialFunctor`` to sample the distribution. + if (!replacement) { + int64_t in_data_numel = x.numel(); + int64_t out_data_numel = out->numel(); + + T* cpu_in_data = new T[in_data_numel]; + int64_t* cpu_out_data = new int64_t[out_data_numel]; + +#ifdef PADDLE_WITH_HIP + hipMemcpy( + cpu_in_data, in_data, in_data_numel * sizeof(T), hipMemcpyDeviceToHost); +#else + cudaMemcpy(cpu_in_data, + in_data, + in_data_numel * sizeof(T), + cudaMemcpyDeviceToHost); +#endif + + MultinomialFunctor(cpu_out_data, + cpu_in_data, + num_samples, + replacement, + num_categories, + num_distributions); + +#ifdef PADDLE_WITH_HIP + hipMemcpy(out_data, + cpu_out_data, + out_data_numel * sizeof(int64_t), + hipMemcpyHostToDevice); +#else + cudaMemcpy(out_data, + cpu_out_data, + out_data_numel * sizeof(int64_t), + cudaMemcpyHostToDevice); +#endif + + delete[] cpu_in_data; + delete[] cpu_out_data; + return; + } + + // Sum of input may not be 1. To get probability in range [0, 1], calculate + // sum of each row of input, and then use the sum to normalize the input. + // sum_row_data: sum of each row + DenseTensor sum_rows_tensor; + sum_rows_tensor.Resize({num_distributions}); + auto* sum_rows_data = dev_ctx.template Alloc(&sum_rows_tensor); + + auto& place = *dev_ctx.eigen_device(); + + if (num_distributions == 1) { + auto eigen_input = EigenVector::Flatten(x); + auto eigen_sum_rows = EigenVector::Flatten(sum_rows_tensor); + eigen_sum_rows.device(place) = + eigen_input.sum(Eigen::DSizes(1)) + .eval() + .reshape(Eigen::DSizes(sum_rows_tensor.dims()[0])); + } else { + auto eigen_input = EigenMatrix::From(x); + auto eigen_sum_rows = EigenVector::Flatten(sum_rows_tensor); + eigen_sum_rows.device(place) = eigen_input.sum(Eigen::DSizes(1)); + } + + // Normalize row of each distribution to get the probability in range [0, + // 1]. 
+ // norm_probs_data: probability of the distribution + DenseTensor norm_probs_tensor; + norm_probs_tensor.Resize({num_distributions, num_categories}); + auto* norm_probs_data = dev_ctx.template Alloc(&norm_probs_tensor); + + // number of threads in a block is min(num_categories, 512) + dim3 block_norm(num_categories < 512 ? num_categories : 512); + dim3 grid_norm((num_distributions * num_categories - 1) / block_norm.x + 1); + NormalizeProbability<<>>( + norm_probs_data, + in_data, + sum_rows_data, + num_distributions, + num_categories); + + // Get cumulative probability of each distribution. It's the same function + // of + // ``cumsum`` op. + DenseTensor cumulative_probs_tensor; + cumulative_probs_tensor.Resize({num_distributions, num_categories}); + auto* cumulative_probs = dev_ctx.template Alloc(&cumulative_probs_tensor); + + dim3 block_cumsum(1); + dim3 grid_cumsum(num_distributions); + GetCumulativeProbs<<>>( + norm_probs_data, num_distributions, num_categories, cumulative_probs); + + // Generate random number for each sample. + std::random_device rd; + auto seed = rd(); + + DenseTensor rng_data_tensor; + rng_data_tensor.Resize({num_distributions, num_samples}); + auto* rng_data = dev_ctx.template Alloc(&rng_data_tensor); + + thrust::counting_iterator index_sequence_begin(0); + paddle::platform::Transform trans; + trans(dev_ctx, + index_sequence_begin, + index_sequence_begin + num_distributions * num_samples, + rng_data, + RandomGeneratorCudaFunctor(seed)); + + // Sample the multinomial distributions. + dim3 block_sample(128); + dim3 grid_sample((num_samples - 1) / block_sample.x + 1, num_distributions); + sampleMultinomialWithReplacement< + T><<>>(rng_data, + num_samples, + out_data, + num_distributions, + num_categories, + cumulative_probs, + norm_probs_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL(multinomial, // cuda_only + GPU, + ALL_LAYOUT, + phi::MultinomialKernel, + float, + double) {} + +#endif diff --git a/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h new file mode 100644 index 0000000000000..d5efd22a31daa --- /dev/null +++ b/paddle/phi/kernels/impl/addmm_grad_kernel_impl.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/addmm_grad_kernel.h" + +#include +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; + +template +void AddmmGradKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + float alpha, + float beta, + DenseTensor* input_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + auto in_dims = input.dims(); + int total_elems = 0; + + VLOG(3) << "alpha: " << alpha << " beta: " << beta; + + if (input_grad != nullptr) { + input_grad->set_lod(out_grad.lod()); + } + if (x_grad != nullptr) { + x_grad->set_lod(x.lod()); + } + if (y_grad != nullptr) { + y_grad->set_lod(y.lod()); + } + + auto blas = funcs::GetBlas(dev_ctx); + if (input_grad) { + dev_ctx.template Alloc(input_grad); + total_elems = in_dims[0] * in_dims[1]; + auto& place = *dev_ctx.eigen_device(); + auto eigen_dout = PhiEigenTensor::From(out_grad); + auto eigen_dinput = PhiEigenTensor::From(*input_grad); + + bool row_compress = in_dims[0] != out_grad.dims()[0]; + bool col_compress = in_dims[1] != out_grad.dims()[1]; + auto eigen_dinput_shape = + Array2(input_grad->dims()[0], input_grad->dims()[1]); + + if (row_compress && col_compress) { + eigen_dinput.device(place) = + eigen_dout.sum().eval().reshape(eigen_dinput_shape); + } else if (row_compress) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(0)).eval().reshape(eigen_dinput_shape); + } else if (col_compress) { + eigen_dinput.device(place) = + eigen_dout.sum(Array1(1)).eval().reshape(eigen_dinput_shape); + } else { + blas.VCOPY(total_elems, out_grad.data(), input_grad->data()); + } + + blas.SCAL(total_elems, beta, input_grad->data()); + } + if (x_grad) { + dev_ctx.template Alloc(x_grad); + total_elems = x.dims()[0] * x.dims()[1]; + // x_grad = out_grad * y'. x_grad: M x K, out_grad : M x N, y : K x N + blas.MatMul(out_grad, false, y, true, x_grad); + blas.SCAL(total_elems, alpha, x_grad->data()); + } + if (y_grad) { + dev_ctx.template Alloc(y_grad); + total_elems = x.dims()[1] * y.dims()[1]; + // y_grad = x' * out_grad. y_grad K x N, out_grad : M x N, x : M x K + blas.MatMul(x, true, out_grad, false, y_grad); + blas.SCAL(total_elems, alpha, y_grad->data()); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/addmm_kernel_impl.h b/paddle/phi/kernels/impl/addmm_kernel_impl.h new file mode 100644 index 0000000000000..f7afdfd622e63 --- /dev/null +++ b/paddle/phi/kernels/impl/addmm_kernel_impl.h @@ -0,0 +1,121 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/addmm_kernel.h" + +#include +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" + +namespace phi { + +template +using PhiEigenTensor = EigenTensor; + +using Array1 = Eigen::DSizes; +using Array2 = Eigen::DSizes; + +template +void AddmmKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& x, + const DenseTensor& y, + float alpha, + float beta, + DenseTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + // broadcast mode check + if (x_dims[0] != input_dims[0]) { + PADDLE_ENFORCE_EQ(input_dims[0], + 1, + errors::InvalidArgument( + "When x_dims[0] is not equal with input_dims[0], " + "input_dims[0] must be 1 but got %s", + input_dims[0])); + PADDLE_ENFORCE_EQ(y_dims[1] == input_dims[1] || input_dims[1] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + // broadcast mode check + if (y_dims[1] != input_dims[1]) { + PADDLE_ENFORCE_EQ(input_dims[1], + 1, + errors::InvalidArgument( + "When y_dims[1] is not equal with input_dims[0], " + "input_dims[0] must be 1 but got %s", + input_dims[1])); + PADDLE_ENFORCE_EQ(x_dims[0] == input_dims[0] || input_dims[0] == 1, + true, + errors::InvalidArgument( + "The input tensor shape mismatch, input shape=[%s], " + "x shape=[%s], y shape=[%s]", + input_dims, + x_dims, + y_dims)); + } + // broadcast mode check + PADDLE_ENFORCE_EQ( + x_dims[1], + y_dims[0], + errors::InvalidArgument( + "The input tensor X's width must be equal with matrix Y' height. " + "But received X's shape = [%s], Y's shape = [%s].", + x_dims[1], + y_dims[0])); + + dev_ctx.template Alloc(out); + auto blas = funcs::GetBlas(dev_ctx); + + // calc broadcast dim + Array2 bcast_dims; + bcast_dims[0] = x_dims[0] / input_dims[0]; + bcast_dims[1] = y_dims[1] / input_dims[1]; + VLOG(3) << "bcast_dims=[" << bcast_dims[0] << "," << bcast_dims[1] << "]"; + // broadcast using eigen + auto eigen_input = PhiEigenTensor::From(input); + auto eigen_out = PhiEigenTensor::From(*out); + auto& place = *dev_ctx.eigen_device(); + funcs::EigenBroadcast, T, 2>::Eval( + place, eigen_out, eigen_input, bcast_dims); + + blas.GEMM(false, + false, + x_dims[0], + y_dims[1], + x_dims[1], + alpha, + x.data(), + x_dims[1], + y.data(), + y_dims[1], + beta, + out->data(), + y_dims[1]); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h new file mode 100644 index 0000000000000..b8df86cc69344 --- /dev/null +++ b/paddle/phi/kernels/impl/cholesky_grad_kernel_impl.h @@ -0,0 +1,336 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/kernels/cholesky_grad_kernel.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +inline void TransCompute(const int dim, + const Context& dev_ctx, + const DenseTensor& in, + DenseTensor* out, + const std::vector& axis) { + switch (dim) { + case 1: + funcs::Transpose trans1; + trans1(dev_ctx, in, out, axis); + break; + case 2: + funcs::Transpose trans2; + trans2(dev_ctx, in, out, axis); + break; + case 3: + funcs::Transpose trans3; + trans3(dev_ctx, in, out, axis); + break; + case 4: + funcs::Transpose trans4; + trans4(dev_ctx, in, out, axis); + break; + case 5: + funcs::Transpose trans5; + trans5(dev_ctx, in, out, axis); + break; + case 6: + funcs::Transpose trans6; + trans6(dev_ctx, in, out, axis); + break; + default: + // for dim >= 7 situation + funcs::TransposeNormal trans_normal; + trans_normal(dev_ctx, in, out, axis); + } +} + +/*! Use these functors to implement tril, triu, diagonal and other operators */ +template +struct EyeFunctor { + EyeFunctor(const int m, const int n, T* output) + : m_(m), n_(n), output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int global_row = index / n_; + const int col = index - global_row * n_; + const int batch = global_row / m_; + const int row = global_row - batch * m_; + output_[index] = col == row ? static_cast(1) : static_cast(0); + } + + const int m_, n_; + T* output_; +}; + +template +struct MatrixSetDiagFunctor { + /*! Overwrite specified diagonals of output by the values in diagonal. + * diagonals can be a central band specified by num_diags and + * upper_diag_index, where upper_diag_index=0 refers to the main diagonal, + * positive value means superdiagonal and negative value means subdiagonal. + * When it is a band, `diag` has a shape [i, j, ..., num_diags, max_diag_len] + * and the num_diags diagonals has a up to down layout. Otherwise it has a + * shape [i, j, ..., max_diag_len]. + */ + MatrixSetDiagFunctor(const int m, + const int n, + const int num_diags, + const int max_diag_len, + const int upper_diag_index, + const T* diag, + T* output) + : m_(m), + n_(n), + num_diags_(num_diags), + max_diag_len_(max_diag_len), + upper_diag_index_(upper_diag_index), + diag_(diag), + output_(output) {} + + HOSTDEVICE void operator()(size_t index) const { + const int batch_and_diag_index = index / max_diag_len_; + const int index_in_the_diagonal = + index - batch_and_diag_index * max_diag_len_; + const int batch = batch_and_diag_index / num_diags_; + const int diag_index_in_input = batch_and_diag_index - batch * num_diags_; + // diag_index=0 refers to the main diagonal + const int diag_index = upper_diag_index_ - diag_index_in_input; + // shift down for subdiagonal if diag_index < 0 + const int y_index = + index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index); + // shift right for superdiagonal if diag_index > 0 + const int x_index = + index_in_the_diagonal + (0 > diag_index ? 0 : diag_index); + + // Upper-bound checks for diagonals shorter than max_diag_len. + // y_index and x_index are nonnegative by construction. + if (y_index < m_ && x_index < n_) { + const int out_index = batch * m_ * n_ + y_index * n_ + x_index; + output_[out_index] = diag_[index]; + } + } + + const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_; + const T* diag_; + T* output_; +}; + +template +struct MatrixDiagPartFunctor { + /*! Similar to MatrixSetDiagFunctor but return the diagonals. 
diag_index=0
+ * refers to the main diagonal, positive value means superdiagonal and
+ * negative value means subdiagonal */
+  MatrixDiagPartFunctor(const int m,
+                        const int n,
+                        const int num_diags,
+                        const int max_diag_len,
+                        const int upper_diag_index,
+                        const T padding,
+                        const T* input,
+                        T* output)
+      : m_(m),
+        n_(n),
+        num_diags_(num_diags),
+        max_diag_len_(max_diag_len),
+        upper_diag_index_(upper_diag_index),
+        padding_(padding),
+        input_(input),
+        output_(output) {}
+
+  HOSTDEVICE void operator()(size_t index) const {
+    const int batch_and_mapped_diag_index = index / max_diag_len_;
+    const int index_in_the_diagonal =
+        index - batch_and_mapped_diag_index * max_diag_len_;
+    const int batch = batch_and_mapped_diag_index / num_diags_;
+    const int mapped_diag_index =
+        batch_and_mapped_diag_index - batch * num_diags_;
+    // diag_index=0 refers to the main diagonal
+    const int diag_index = upper_diag_index_ - mapped_diag_index;
+    // shift down for subdiagonal if diag_index < 0
+    const int y_index =
+        index_in_the_diagonal + (0 > -diag_index ? 0 : -diag_index);
+    // shift right for superdiagonal if diag_index > 0
+    const int x_index =
+        index_in_the_diagonal + (0 > diag_index ? 0 : diag_index);
+    if (y_index < m_ && x_index < n_) {
+      output_[index] = input_[batch * m_ * n_ + y_index * m_ + x_index];
+    } else {
+      output_[index] = padding_;
+    }
+  }
+
+  const int m_, n_, num_diags_, max_diag_len_, upper_diag_index_;
+  const T padding_;
+  const T* input_;
+  T* output_;
+};
+
+template 
+struct MatrixBandPartScaleEndFunctor {
+  /*! Compared with MatrixBandPartFunctor, it scales up values at the end of
+   * the band. It can be used to fuse the following operations, which together
+   * output a triangular matrix with the diagonal scaled up:
+   *   1. diag = matrix_diag_part(middle)
+   *   2. middle = matrix_set_diag(middle, diag * scalar)
+   *   3. middle = matrix_band_part(middle, -1, 0)
+   */
+  MatrixBandPartScaleEndFunctor(const int m,
+                                const int n,
+                                const int num_lower_diags,
+                                const int num_upper_diags,
+                                const T scale,
+                                const T* input,
+                                T* output)
+      : m_(m),
+        n_(n),
+        num_lower_diags_(num_lower_diags),
+        num_upper_diags_(num_upper_diags),
+        scale_(scale),
+        input_(input),
+        output_(output) {}
+
+  HOSTDEVICE void operator()(size_t index) const {
+    const int col = index % n_;
+    const int row = (index / n_) % m_;
+    const int band_start = (num_lower_diags_ < 0 ? 0 : row - num_lower_diags_);
+    const int band_end =
+        (num_upper_diags_ < 0 ?
n_ : row + num_upper_diags_ + 1); + if (col < band_start || col >= band_end) { + output_[index] = 0; + } else if (col == band_end - 1) { + output_[index] = scale_ * input_[index]; + } else { + output_[index] = input_[index]; + } + } + + const int m_, n_, num_lower_diags_, num_upper_diags_; + const T scale_; + const T* input_; + T* output_; +}; + +template +struct AddtoScaleFunctor { + AddtoScaleFunctor(const T scale, const T* input, T* output) + : scale_(scale), input_(input), output_(output) {} + HOSTDEVICE void operator()(size_t index) const { + output_[index] += input_[index]; + output_[index] *= scale_; + } + const T scale_; + const T* input_; + T* output_; +}; + +template +void CholeskyGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + bool upper, + DenseTensor* x_grad) { + auto* x_grad_data = dev_ctx.template Alloc(x_grad); + + auto& dims = out.dims(); + int batch_count = 1; + for (int i = 0; i < dims.size() - 2; i++) { + batch_count *= dims[i]; + } + auto m = dims[dims.size() - 1]; + int tensor_size = batch_count * m * m; + + std::vector axis(dims.size() - 2); + std::iota(axis.begin(), axis.end(), 0); + axis.insert(axis.end(), {dims.size() - 1, dims.size() - 2}); + DenseTensor l, l_grad; + if (upper) { + l.Resize(dims); + dev_ctx.template Alloc(&l); + l_grad.Resize(dims); + dev_ctx.template Alloc(&l_grad); + TransCompute(dims.size(), dev_ctx, out, &l, axis); + TransCompute(dims.size(), dev_ctx, out_grad, &l_grad, axis); + } else { + l = out; + l_grad = out_grad; + } + auto* l_data = l.data(); + + /*! refer to Iain Murray (2016); arXiv 1602.07527 */ + /*! phi = matmul(L.transpose(-1, -2), grad) */ + DenseTensor middle; + middle.Resize(dims); + auto* middle_data = dev_ctx.template Alloc(&middle); + auto trans_desc = funcs::CreateMatrixDescriptor(dims, 0, true); + auto no_trans_desc = funcs::CreateMatrixDescriptor(dims, 0, false); + auto blas = funcs::GetBlas(dev_ctx); + blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0)); + + /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */ + paddle::platform::ForRange for_range(dev_ctx, tensor_size); + MatrixBandPartScaleEndFunctor matrix_band_part_scale_end_functor( + m, + m, + /* num_lower_diags */ m, + /* num_upper_diags */ 0, + /* scale */ 0.5, + middle_data, + middle_data); + for_range(matrix_band_part_scale_end_functor); + + // Compute inverse by solving the triangular linear system AX = B, where B + // is the identity matrix. The matrix X would be overwritten on B + DenseTensor identity; + identity.Resize(dims); + auto* identity_data = dev_ctx.template Alloc(&identity); + EyeFunctor eye_functor(m, m, identity_data); + for_range(eye_functor); + // TODO(guosheng): use trsmBatched for GPU + for (int i = 0; i < batch_count; i++) { + blas.TRSM(/*side*/ CblasLeft, + /*uplo*/ CblasLower, + /*trans*/ CblasNoTrans, + /*diag*/ CblasNonUnit, + /*m*/ m, + /*n*/ m, + /*alpha*/ T(1), + l_data + i * m * m, + /*lda*/ m, + identity_data + i * m * m, + /*ldb*/ m); + } + DenseTensor& l_inverse = identity; + + /*! x_grad = matmul(matmul(L_inverse.transpose(-1, -2), phi), L_inverse) */ + DenseTensor middle1; + middle1.Resize(dims); + dev_ctx.template Alloc(&middle1); + blas.MatMul( + l_inverse, trans_desc, middle, no_trans_desc, T(1), &middle1, T(0)); + blas.MatMul( + middle1, no_trans_desc, l_inverse, no_trans_desc, T(1), x_grad, T(0)); + + /*! 
x_grad.add(x_grad.transpose(-1, -2)).mul_(0.5) */ + DenseTensor x_grad_trans; + x_grad_trans.Resize(dims); + auto* x_grad_trans_data = dev_ctx.template Alloc(&x_grad_trans); + TransCompute(dims.size(), dev_ctx, *x_grad, &x_grad_trans, axis); + AddtoScaleFunctor addto_scale_functor(0.5, x_grad_trans_data, x_grad_data); + for_range(addto_scale_functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/increment_kernel_impl.h b/paddle/phi/kernels/impl/increment_kernel_impl.h new file mode 100644 index 0000000000000..0756807a87532 --- /dev/null +++ b/paddle/phi/kernels/impl/increment_kernel_impl.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/increment_kernel.h" + +namespace phi { + +template +void IncrementKernel(const Context& dev_ctx, + const DenseTensor& x, + float value, + DenseTensor* out) { + dev_ctx.template Alloc(out); + auto& dev = *dev_ctx.eigen_device(); + funcs::EigenAdd, T>::Eval( + dev, + EigenScalar::From(*out), + EigenScalar::From(x), + static_cast(value)); +} + +} // namespace phi diff --git a/paddle/phi/kernels/increment_kernel.h b/paddle/phi/kernels/increment_kernel.h new file mode 100644 index 0000000000000..7c5bc2a202791 --- /dev/null +++ b/paddle/phi/kernels/increment_kernel.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IncrementKernel(const Context& ctx, + const DenseTensor& x, + float value, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/fluid/operators/multinomial_op.h b/paddle/phi/kernels/multinomial_kernel.h similarity index 70% rename from paddle/fluid/operators/multinomial_op.h rename to paddle/phi/kernels/multinomial_kernel.h index 077e0e0ffa57e..70be21dc2861f 100644 --- a/paddle/fluid/operators/multinomial_op.h +++ b/paddle/phi/kernels/multinomial_kernel.h @@ -1,31 +1,30 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #pragma once -#include + +#include "paddle/phi/core/dense_tensor.h" #include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/hostdevice.h" -namespace paddle { -namespace operators { +namespace phi { -/** - * Samples a multinomial distribution given a probability input - */ +template +void MultinomialKernel(const Context& dev_ctx, + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out); template void MultinomialFunctor(int64_t* out_data, const T* in_data, @@ -35,7 +34,7 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, std::vector cumulative_probs(num_categories); std::uniform_real_distribution dist(0, 1); - auto gen_ptr = framework::DefaultCPUGenerator(); + auto gen_ptr = paddle::framework::DefaultCPUGenerator(); auto engine = gen_ptr->GetCPUEngine(); for (int64_t i = 0; i < num_distributions; i++) { @@ -45,7 +44,7 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, for (int64_t j = 0; j < num_categories; j++) { prob_value = in_data[i * num_categories + j]; PADDLE_ENFORCE_GE(prob_value, 0.0, - platform::errors::InvalidArgument( + errors::InvalidArgument( "The input of multinomial distribution " "should be >= 0, but got %f.", prob_value)); @@ -57,13 +56,13 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, cumulative_probs[j] = probs_sum; } PADDLE_ENFORCE_GT(probs_sum, 0.0, - platform::errors::InvalidArgument( + errors::InvalidArgument( "The sum of one multinomial distribution " "probability should be > 0, but got %f.", probs_sum)); PADDLE_ENFORCE_EQ( (replacement || (num_categories - num_zeros >= num_samples)), true, - platform::errors::InvalidArgument( + errors::InvalidArgument( "When replacement is False, number of " "samples should be less than non-zero " "categories.")); @@ -121,8 +120,4 @@ void MultinomialFunctor(int64_t* out_data, const T* in_data, } } -template -class MultinomialOpKernel; - -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/ops/compat/addmm_sig.cc b/paddle/phi/ops/compat/addmm_sig.cc new file mode 100644 index 0000000000000..34da5fe9fe954 --- /dev/null +++ b/paddle/phi/ops/compat/addmm_sig.cc @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature AddmmOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "addmm", {"Input", "X", "Y"}, {"Alpha", "Beta"}, {"Out"}); +} + +KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "addmm_grad", + {"Input", "X", "Y", GradVarName("Out")}, + {"Alpha", "Beta"}, + {GradVarName("Input"), GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(addmm, phi::AddmmOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(addmm_grad, phi::AddmmGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/cholesky_sig.cc b/paddle/phi/ops/compat/cholesky_sig.cc new file mode 100644 index 0000000000000..068c7f4f0a77a --- /dev/null +++ b/paddle/phi/ops/compat/cholesky_sig.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature CholeskyOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("cholesky", {"X"}, {"upper"}, {"Out"}); +} + +KernelSignature CholeskyGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("cholesky_grad", + {"Out", GradVarName("Out")}, + {"upper"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(cholesky, phi::CholeskyOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(cholesky_grad, phi::CholeskyGradOpArgumentMapping); From c5ae43a2676f275b32543a281a555004182a98d6 Mon Sep 17 00:00:00 2001 From: ronnywang <524019753@qq.com> Date: Thu, 24 Feb 2022 18:57:32 +0800 Subject: [PATCH 36/85] fix paddle.where torch diff (#39859) --- .../fluid/tests/unittests/test_where_op.py | 31 +++++++++++++++++++ python/paddle/tensor/search.py | 10 ++++-- 2 files changed, 39 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_where_op.py b/python/paddle/fluid/tests/unittests/test_where_op.py index d601117b96f12..7fb4d39cd7338 100644 --- a/python/paddle/fluid/tests/unittests/test_where_op.py +++ b/python/paddle/fluid/tests/unittests/test_where_op.py @@ -139,6 +139,28 @@ def test_api_broadcast(self, use_cuda=False): fetch_list=[result]) assert np.array_equal(out[0], np.where((x_i > 1), x_i, y_i)) + def test_scalar(self): + paddle.enable_static() + main_program = Program() + with fluid.program_guard(main_program): + cond_shape = [2, 4] + cond = fluid.layers.data( + name='cond', shape=cond_shape, dtype='bool') + x_data = 1.0 + y_data = 2.0 + cond_data = np.array([False, False, True, True]).astype('bool') + result = paddle.where(condition=cond, x=x_data, y=y_data) + for use_cuda in [False, True]: + if (use_cuda and (not fluid.core.is_compiled_with_cuda())): + return + place = (fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()) + exe = fluid.Executor(place) + out = exe.run(fluid.default_main_program(), + feed={'cond': cond_data}, + fetch_list=[result]) + expect = np.where(cond_data, x_data, y_data) + assert np.array_equal(out[0], expect) + def __test_where_with_broadcast_static(self, cond_shape, x_shape, y_shape): paddle.enable_static() main_program = Program() @@ -227,6 +249,15 @@ def test_api(self): out = paddle.where(cond, x, y) assert np.array_equal(out.numpy(), np.where(cond_i, x_i, y_i)) + def test_scalar(self): + with fluid.dygraph.guard(): + cond_i = np.array([False, False, True, True]).astype('bool') + x = 1.0 + y = 2.0 + cond = fluid.dygraph.to_variable(cond_i) + out = paddle.where(cond, x, y) + assert np.array_equal(out.numpy(), np.where(cond_i, x, y)) + def __test_where_with_broadcast_dygraph(self, cond_shape, a_shape, b_shape): with fluid.dygraph.guard(): cond_tmp = paddle.rand(cond_shape) diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 5c5517e54f71a..ecf70ffe4a1dd 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -543,8 +543,8 @@ def where(condition, x=None, y=None, name=None): Args: condition(Tensor): The condition to choose x or y. - x(Tensor, optional): x is a Tensor with data type float32, float64, int32, int64. Either both or neither of x and y should be given. - y(Tensor, optional): y is a Tensor with data type float32, float64, int32, int64. Either both or neither of x and y should be given. + x(Tensor or Scalar, optional): x is a Tensor or Scalar with data type float32, float64, int32, int64. 
Either both or neither of x and y should be given. + y(Tensor or Scalar, optional): y is a Tensor or Scalar with data type float32, float64, int32, int64. Either both or neither of x and y should be given. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please @@ -571,6 +571,12 @@ def where(condition, x=None, y=None, name=None): # [[2], # [3]]),) """ + if np.isscalar(x): + x = layers.fill_constant([1], np.array([x]).dtype.name, x) + + if np.isscalar(y): + y = layers.fill_constant([1], np.array([y]).dtype.name, y) + if x is None and y is None: return nonzero(condition, as_tuple=True) From ce207c3aba1b1c3eebcc7fb7cb1ba3da2f0c460b Mon Sep 17 00:00:00 2001 From: zn <96479180+kangna-qi@users.noreply.github.com> Date: Thu, 24 Feb 2022 19:04:44 +0800 Subject: [PATCH 37/85] [MLU]add mlu kernel for allreduce (#39788) --- .../operators/collective/c_allreduce_op.h | 68 +++++++++++++++++- .../collective/c_allreduce_sum_op_mlu.cc | 26 +++++++ .../collective/c_broadcast_op_mlu.cc | 3 +- .../fluid/tests/unittests/mlu/CMakeLists.txt | 1 + .../unittests/mlu/collective_allreduce_op.py | 70 +++++++++++++++++++ .../mlu/test_collective_allreduce.py | 55 +++++++++++++++ .../unittests/mlu/test_collective_base_mlu.py | 8 +++ 7 files changed, 228 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce.py diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index a04935d43eb2d..7e5120cd2b392 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -23,8 +23,9 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ - defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \ + defined(PADDLE_WITH_CNCL) #include "paddle/fluid/platform/collective_helper.h" #endif @@ -45,6 +46,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device/npu/hccl_helper.h" #endif +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif + #if defined(PADDLE_WITH_ASCEND_CL) DECLARE_bool(hccl_check_nan); #endif @@ -398,6 +403,65 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel { } }; +template +class CAllReduceOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(in->type())); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::CNCLCommContext::Instance().Get(rid, place); + + mluStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + cnclReduceOp_t cncl_red_type = cnclSum; + switch (red_type) { + case kRedSum: + cncl_red_type = cnclSum; + break; + + case kRedMax: + cncl_red_type = cnclMax; + break; + + case kRedMin: + cncl_red_type = cnclMin; + break; + + case kRedProd: + cncl_red_type = cnclProd; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce( + sendbuff, recvbuff, numel, dtype, cncl_red_type, comm->comm(), stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with MLU.")); +#endif + } +}; + class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() { diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc new file mode 100644 index 0000000000000..4879696b3f470 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_mlu.cc @@ -0,0 +1,26 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel, + ops::CAllReduceOpMLUKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc index 123fb2aafb524..d315f211709e4 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op_mlu.cc @@ -30,7 +30,8 @@ class CBroadcastOPMLUKernel : public framework::OpKernel { auto x = ctx.Input("X"); auto out = ctx.Output("Out"); int numel = x->numel(); - cnclDataType_t dtype = platform::ToCNCLDataType(x->type()); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(x->type())); int rid = ctx.Attr("ring_id"); auto place = ctx.GetPlace(); diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index 2e588355ce793..41f3a31017e7f 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -6,4 +6,5 @@ if (WITH_MLU) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py new file mode 100644 index 0000000000000..0371e1bbb2406 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/collective_allreduce_op.py @@ -0,0 +1,70 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import argparse +import os +import sys +import signal +import time +import socket +from contextlib import closing +from six import string_types +import math +import paddle +import paddle.fluid as fluid +import paddle.fluid.profiler as profiler +import paddle.fluid.unique_name as nameGen +from paddle.fluid import core +import unittest +from multiprocessing import Process +import paddle.fluid.layers as layers +from functools import reduce +from test_collective_base_mlu import TestCollectiveRunnerBase, runtime_main + +paddle.enable_static() + + +class TestCollectiveAllreduce(TestCollectiveRunnerBase): + def __init__(self): + self.global_ring_id = 0 + + def get_model(self, main_prog, startup_program): + ring_id = 0 + with fluid.program_guard(main_prog, startup_program): + tindata = layers.data( + name="tindata", shape=[10, 1000], dtype='float32') + toutdata = main_prog.current_block().create_var( + name="outofallreduce", + dtype='float32', + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=False) + main_prog.global_block().append_op( + type="c_allreduce_sum", + inputs={'X': tindata}, + attrs={'ring_id': ring_id}, + outputs={'Out': toutdata}) + main_prog.global_block().append_op( + type="c_sync_comm_stream", + inputs={'X': toutdata}, + outputs={'Out': toutdata}, + attrs={'ring_id': ring_id}) + return toutdata + + +if __name__ == "__main__": + runtime_main(TestCollectiveAllreduce, "allreduce", 0) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce.py new file mode 100644 index 0000000000000..5fd5db7a604d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_allreduce.py @@ -0,0 +1,55 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import sys +import unittest +import numpy as np +import paddle + +from test_collective_base_mlu import TestDistBase + +paddle.enable_static() + + +class TestCAllreduceOp(TestDistBase): + def _setup_config(self): + pass + + def test_allreduce_fp32(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "float32") + + def test_allreduce_fp16(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "float16") + + def test_allreduce_int32(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "int32") + + def test_allreduce_int16(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "int16") + + def test_allreduce_int8(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", "int8") + + def test_allreduce_uint8(self): + self.check_with_place("collective_allreduce_op.py", "allreduce", + "uint8") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py index 2a7c64fe48972..4692c893d00b4 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py @@ -262,5 +262,13 @@ def check_with_place(self, need_result = input2 self.assertTrue(np.allclose(tr0_out, need_result)) self.assertTrue(np.allclose(tr1_out, need_result)) + elif col_type == "allreduce": + need_result = input1 + input2 + self.assertTrue( + np.allclose( + tr0_out, need_result, rtol=1e-05, atol=1e-05)) + self.assertTrue( + np.allclose( + tr1_out, need_result, rtol=1e-05, atol=1e-05)) else: pass From 1255e7d6fa6a6ca75821273c5839a657cd1a4757 Mon Sep 17 00:00:00 2001 From: Wangzheee <634486483@qq.com> Date: Thu, 24 Feb 2022 20:34:11 +0800 Subject: [PATCH 38/85] [Paddle-Inference] fix special_slice plugin (#39875) * fix plugin: special slice for ernie --- .../tensorrt/plugin/special_slice_plugin.cu | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu index ecf06e9bf1513..324e9c0392c93 100644 --- a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -113,12 +113,12 @@ nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType( template __global__ void SpecialSliceKernel(const T* slice_input, const int32_t* cu_seqlens, T* output) { - const int hidden = blockDim.x * gridDim.y; - const int batch = blockIdx.x; - const int local_idx = blockIdx.y * blockDim.y + threadIdx.x; + const int hidden = blockDim.x * gridDim.x; + const int hidden_id = blockIdx.x * blockDim.x + threadIdx.x; + const int batch_id = blockIdx.y; - output[batch * hidden + local_idx] = - slice_input[cu_seqlens[batch] * hidden + local_idx]; + output[batch_id * hidden + hidden_id] = + slice_input[cu_seqlens[batch_id] * hidden + hidden_id]; } int SpecialSlicePluginDynamic::enqueue( @@ -137,15 +137,16 @@ int SpecialSlicePluginDynamic::enqueue( "hidden should be multiple of 128.")); constexpr int num_threads = 128; - const dim3 blocks(out_dims.d[0], hidden / num_threads); - const half* slice_input = static_cast(inputs[0]); const int32_t* cu_seqlens = static_cast(inputs[1]); half* output = static_cast(outputs[0]); - SpecialSliceKernel<<>>(slice_input, - cu_seqlens, 
output); + const int32_t num_blocks_x = hidden / num_threads; + const int32_t num_blocks_y = out_dims.d[0]; // batchs + const dim3 num_blocks(num_blocks_x, num_blocks_y); // blocks + SpecialSliceKernel<<>>( + slice_input, cu_seqlens, output); return cudaGetLastError() != cudaSuccess; } From e0409c93c0b4b5f063421eac32c26be5b83de012 Mon Sep 17 00:00:00 2001 From: Allen Guo Date: Thu, 24 Feb 2022 20:43:50 +0800 Subject: [PATCH 39/85] [IPU] Update IpuStrategy Python Part (#39646) * Update IpuStrategy Python Part * add docs * add add_custom_op for ipu_strategy * fix build warning * rm unneeded part * clean api * fix typo * update option names * update IpuStrategy doc --- paddle/fluid/pybind/pybind.cc | 214 ++++++++++++-------- paddle/fluid/pybind/tensor_py.h | 10 +- python/paddle/fluid/compiler.py | 337 ++++++++++++++++++++------------ python/paddle/fluid/executor.py | 3 - 4 files changed, 351 insertions(+), 213 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 958174420570e..1ea9c7c65d5f5 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -3786,86 +3786,142 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_IPU py::class_>(m, "IpuBackend") - .def(py::init(&platform::ipu::IpuBackend::GetNewInstance)) - .def("clear", &platform::ipu::IpuBackend::Clear) + std::unique_ptr>( + m, "IpuBackend") + // manage IpuBackend in C++ + .def("get_instance", + []() { + return std::unique_ptr( + platform::ipu::IpuBackend::GetInstance()); + }, + py::return_value_policy::reference) + .def("detach", &platform::ipu::IpuBackend::Detach) + .def("reset", &platform::ipu::IpuBackend::Reset) .def("set_scope", &platform::ipu::IpuBackend::SetScope) - .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy); - - py::class_ ipu_strategy(m, "IpuStrategy"); - ipu_strategy.def(py::init()) - .def_property( - "num_ipus", - [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; }, - [](platform::ipu::IpuStrategy &self, int num_ipus) { - self.num_ipus = num_ipus; - }) - .def_property( - "accumulationFactor", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.accumulationFactor; - }, - [](platform::ipu::IpuStrategy &self, int accumulationFactor) { - self.popart_options_.accumulationFactor = accumulationFactor; - }) - .def_property("batches_per_step", - [](const platform::ipu::IpuStrategy &self) { - return self.batches_per_step; - }, - [](platform::ipu::IpuStrategy &self, int batches_per_step) { - self.batches_per_step = batches_per_step; - }) - .def_property("is_training", - [](const platform::ipu::IpuStrategy &self) { - return self.is_training; - }, - [](platform::ipu::IpuStrategy &self, bool is_training) { - self.is_training = is_training; - }) - .def_property( - "enable_pipelining", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.enablePipelining; - }, - [](platform::ipu::IpuStrategy &self, bool enable_pipelining) { - self.popart_options_.enablePipelining = enable_pipelining; - }) - .def_property( - "enable_manual_shard", - [](const platform::ipu::IpuStrategy &self) { - return self.popart_options_.virtualGraphMode == - platform::ipu::VirtualGraphMode::Manual; - }, - [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) { - if (enable_ipu_shard) { - self.popart_options_.virtualGraphMode = - platform::ipu::VirtualGraphMode::Manual; - } else { - self.popart_options_.virtualGraphMode = - platform::ipu::VirtualGraphMode::Off; - } - }) - 
.def_property("need_avg_shard", - [](const platform::ipu::IpuStrategy &self) { - return self.need_avg_shard; - }, - [](platform::ipu::IpuStrategy &self, bool need_avg_shard) { - self.need_avg_shard = need_avg_shard; - }) - .def_property("batch_size", - [](const platform::ipu::IpuStrategy &self) { - return self.batch_size; - }, - [](platform::ipu::IpuStrategy &self, int batch_size) { - self.batch_size = batch_size; - }) - .def_property("enable_fp16", - [](const platform::ipu::IpuStrategy &self) { - return self.enable_fp16; - }, - [](platform::ipu::IpuStrategy &self, bool enable_fp16) { - self.enable_fp16 = enable_fp16; - }); + .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy) + .def("save_model_proto", &platform::ipu::IpuBackend::SaveModelProto); + + py::class_(m, "IpuStrategy") + .def(py::init()) + .def("set_options", + [](platform::ipu::IpuStrategy &self, const py::dict &opt) { + for (auto element : opt) { + auto option_name = element.first.cast(); + VLOG(10) << "Set option: " << option_name; + if (py::isinstance(element.second)) { + self.AddBoolOption(option_name, element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddDoubleOption(option_name, + element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddUint64Option(option_name, + element.second.cast()); + } else if (py::isinstance(element.second)) { + self.AddStringOption(option_name, + element.second.cast()); + } else if (py::isinstance(element.second) || + py::isinstance(element.second)) { + for (auto option : element.second.cast()) { + std::string option_val; + if (py::isinstance(option)) { + option_val = option.cast(); + } else if (py::isinstance(option)) { + option_val = std::to_string(option.cast()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Failed to convert type: %s when set IpuStrategy " + "option: %s", + option.get_type(), option_name)); + } + self.InsertStringOption(option_name, option_val); + } + } else if (py::isinstance(element.second)) { + if (option_name.rfind("location_", 0) == 0) { + for (auto option : element.second.cast()) { + self.SetTensorLocation( + option_name, option.first.cast(), + option.second.cast()); + } + } else if (option_name == "custom_op") { + std::string paddle_op; + std::string popart_op; + std::string domain; + int version = -1; + for (auto option : element.second.cast()) { + std::string option_key = option.first.cast(); + if (option_key == "paddle_op") { + paddle_op = option.second.cast(); + } else if (option_key == "popart_op") { + popart_op = option.second.cast(); + } else if (option_key == "domain") { + domain = option.second.cast(); + } else if (option_key == "version") { + version = option.second.cast(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid argument, key must be one of paddle_op, " + "popart_op, domain or version, but revecived %s", + option_key)); + } + } + self.AddCustomOp(paddle_op, popart_op, domain, version); + } else { + for (auto option : element.second.cast()) { + std::string option_key = option.first.cast(); + std::string option_val; + if (py::isinstance(option.second)) { + option_val = option.second.cast(); + } else if (py::isinstance(option.second)) { + option_val = + std::to_string(option.second.cast()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Failed to convert value type: %s when set " + "IpuStrategy option: %s", + option.second.get_type(), option_key)); + } + self.InsertStringPairOption(option_name, option_key, + option_val); + } + } + } 
else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid IpuStrategy option value type: %s, please check " + "input value for option: %s", + element.second.get_type(), option_name)); + } + } + }) + .def("get_option", + [](platform::ipu::IpuStrategy &self, const std::string &name) { + py::dict res; + auto option_type = self.GetOptionType(name); + res["name"] = name; + res["type"] = option_type; + if (option_type == "vector") { + auto value = self.GetVectorOption(name); + res["value"] = value; + } else if (option_type == "map") { + auto value = self.GetMapOption(name); + res["value"] = value; + } else { + auto value_s = self.GetOption(name); + res["value_s"] = value_s; + if (option_type == "bool") { + res["value"] = static_cast(std::stoi(value_s)); + } else if (option_type == "uint64") { + res["value"] = std::stoul(value_s); + } else if (option_type == "double") { + res["value"] = std::stod(value_s); + } else if (option_type == "string") { + res["value"] = value_s; + } + } + return res; + }) + .def("enable_pattern", &platform::ipu::IpuStrategy::EnablePattern) + .def("disable_pattern", &platform::ipu::IpuStrategy::DisablePattern) + .def("is_pattern_enabled", &platform::ipu::IpuStrategy::IsPatternEnabled); #endif BindFleetWrapper(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 531cc03f26714..49bacc1cd6d85 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -350,8 +350,14 @@ void SetTensorFromPyArrayT( auto type = framework::ToDataType(std::type_index(typeid(T))); self->ResetHolderWithType(holder, framework::TransToPtenDataType(type)); } else { - auto dst = self->mutable_data(place); - std::memcpy(dst, array.data(), array.nbytes()); + // IPU does not store Tensor data, Tensor will be created on CPU + if (!self->initialized()) { + auto dst = self->mutable_data(place); + std::memcpy(dst, array.data(), array.nbytes()); + } else { + auto dst = self->mutable_data(self->place()); + std::memcpy(dst, array.data(), array.nbytes()); + } } #else PADDLE_THROW(platform::errors::PermissionDenied( diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 7e3dfde5d4f67..b8a696057e780 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -502,9 +502,6 @@ class IpuStrategy(object): """ Help users precisely control the graph building in :code:`paddle.static.IpuCompiledProgram` . - Args: - None. - Returns: The IpuStrategy instance. @@ -517,23 +514,36 @@ class IpuStrategy(object): import paddle.static as static paddle.enable_static() + ipu_strategy = static.IpuStrategy() """ def __init__(self): if core.is_compiled_with_ipu(): self._ipu_strategy = core.IpuStrategy() + default_options = { + 'location_optimizer': { + 'on_chip': 0, + 'use_replicated_tensor_sharding': 1, + }, # set optimizer location + 'accumulation_and_replication_reduction_type': + 1, # popart::ReductionType::Mean + 'mean_accumulation_and_replication_reduction_strategy': + 1, # popart::MeanReductionStrategy::Post + } + self._ipu_strategy.set_options(default_options) + self.has_custom_ops = False + self.custom_op_names = [] else: raise RuntimeError( "Can not use IpuStrategy in non IPU compiled environment, please re-compile with WITH_IPU=ON." 
) - def SetGraphConfig(self, - num_ipus=1, - is_training=True, - batch_size=1, - enable_manual_shard=False, - need_avg_shard=False): + def set_graph_config(self, + num_ipus=1, + is_training=True, + batch_size=1, + enable_manual_shard=False): """ Set graph configuration to the IpuStrategy instance. @@ -544,8 +554,6 @@ def SetGraphConfig(self, if the batch-size in the graph is dynamic. Default 1, which means the batch-size would be set 1, if the batch-size is dynamice. enable_manual_shard (bool, optional): Enable graph sharding or not. Only if num_ipus > 1, enable_manual_shard is able to be set True. Default False, which means disabled. - need_avg_shard (bool, optional): Enable auto graph sharding or not. Only if num_ipus > 1 and enable_manual_shard=True, need_avg_shard is able to be set Trues. - Default False, which means disabled. Returns: None. @@ -559,32 +567,29 @@ def SetGraphConfig(self, import paddle.static as static paddle.enable_static() + ipu_strategy = static.IpuStrategy() - ipu_strategy.SetGraphConfig(num_ipus=1, + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1, - enable_manual_shard=False, - need_avg_shard=False) + enable_manual_shard=False) """ - - self._ipu_strategy.num_ipus = num_ipus - self._ipu_strategy.is_training = is_training - self._ipu_strategy.batch_size = batch_size - self._ipu_strategy.enable_manual_shard = enable_manual_shard - if self._ipu_strategy.num_ipus == 1 and self._ipu_strategy.enable_manual_shard: + if num_ipus == 1 and enable_manual_shard: raise RuntimeError( "Only if num_ipus > 1, enable_manual_shard is able to be set True." ) - self._ipu_strategy.need_avg_shard = need_avg_shard - if self._ipu_strategy.enable_manual_shard != True and self._ipu_strategy.need_avg_shard: - raise RuntimeError( - "Only if enable_manual_shard=True, need_avg_shard is able to be set True." - ) - - def SetPipeliningConfig(self, - enable_pipelining=False, - batches_per_step=1, - accumulationFactor=1): + options = { + 'num_ipus': num_ipus, + 'is_training': is_training, + 'micro_batch_size': batch_size, + 'enable_manual_shard': enable_manual_shard, + } + self.set_options(options) + + def set_pipelining_config(self, + enable_pipelining=False, + batches_per_step=1, + accumulation_factor=1): """ Set pipelining configuration to the IpuStrategy instance. Used to optimize the throughput performance. @@ -593,7 +598,7 @@ def SetPipeliningConfig(self, Default False, which means disabled. batches_per_step (int, optional): Set the batches per run in data pipelining mode. Only if enable_pipelining=True, batches_per_step is able to be set > 1. Default 1, which means no data pipelining. - accumulationFactor (int, optional): Specify the number of micro-batches to accumulate + accumulation_factor (int, optional): Specify the number of micro-batches to accumulate before applying the varUpdate. Default 1, which means disable the accumulation. 
Returns: @@ -610,23 +615,23 @@ def SetPipeliningConfig(self, paddle.enable_static() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetPipeliningConfig(enable_pipelining=False, - batches_per_step=1, - accumulationFactor=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, + batches_per_step=1, + accumulation_factor=1) """ - self._ipu_strategy.enable_pipelining = enable_pipelining - if self._ipu_strategy.enable_manual_shard != True and self._ipu_strategy.enable_pipelining: + enable_manual_shard = self.get_option('enable_manual_shard') + if not enable_manual_shard and enable_pipelining: raise RuntimeError( "Only if enable_manual_shard=True, enable_pipelining is able to be set True." ) - self._ipu_strategy.batches_per_step = batches_per_step - if self._ipu_strategy.enable_pipelining != True and self._ipu_strategy.batches_per_step > 1: - raise RuntimeError( - "Only if enable_pipelining=True, batches_per_step is able to be set > 1." - ) - self._ipu_strategy.accumulationFactor = accumulationFactor - - def SetHalfConfig(self, enable_fp16=False): + options = { + 'enable_pipelining': enable_pipelining, + 'batches_per_step': batches_per_step, + 'accumulation_factor': accumulation_factor, + } + self.set_options(options) + + def set_precision_config(self, enable_fp16=False): """ Set half computation configuration to the IpuStrategy instance. Used to optimize the performance. @@ -647,73 +652,135 @@ def SetHalfConfig(self, enable_fp16=False): paddle.enable_static() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetHalfConfig(enable_fp16=False) + ipu_strategy.set_precision_config(enable_fp16=False) + """ + options = {'enable_fp16': enable_fp16, } + self.set_options(options) + + def add_custom_op(self, + paddle_op, + popart_op=None, + domain='custom.ops', + version=1): """ + Add a mapping to use popart custom ops running on the IPU. - self._ipu_strategy.enable_fp16 = enable_fp16 + Args: + paddle_op(str): the name of custom op in paddle. - @property - def num_ipus(self): - """ - Get the number of IPU devices from IpuStrategy instance. - """ - return self._ipu_strategy.num_ipus + popart_op(str): the name of custom op in popart. - @property - def is_training(self): - """ - Get the boolean of training or inference from IpuStrategy instance. - """ - return self._ipu_strategy.is_training + domain(str): domain name of custom op in popart. - @property - def batch_size(self): + version(int): version of custom op in popart. + + Returns: + None. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + ipu_strategy.add_custom_op('paddle_relu', 'popart_relu') """ - Get the batch_size used in dynamic batch_size graph from IpuStrategy instance. + if popart_op is None: + popart_op = paddle_op + custom_op = { + 'paddle_op': paddle_op, + 'popart_op': popart_op, + 'domain': domain, + 'version': version, + } + self.set_options({'custom_op': custom_op}) + self.custom_op_names.append(paddle_op) + if not self.has_custom_ops: + self.has_custom_ops = True + + def set_options(self, options): """ - return self._ipu_strategy.batch_size + Set options from dict. - @property - def enable_manual_shard(self): - """ - Get the boolean of enable manual shard or not from IpuStrategy instance. + Args: + options(dict): dict of options. + + Returns: + None. + + Examples: + .. 
code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + options = {'num_ipus':1, 'enable_fp16': True} + ipu_strategy.set_options(options) """ - return self._ipu_strategy.enable_manual_shard + self._ipu_strategy.set_options(options) - @property - def need_avg_shard(self): + def get_option(self, option): """ - Get the boolean of need average shard or not from IpuStrategy instance. + Get option. + + Args: + option(str): name of option. + + Returns: + option value. + + Examples: + .. code-block:: python + + # required: ipu + + import paddle + import paddle.static as static + + paddle.enable_static() + + ipu_strategy = static.IpuStrategy() + num_ipus = ipu_strategy.get_option('num_ipus') """ - return self._ipu_strategy.need_avg_shard + return self._ipu_strategy.get_option(option)['value'] @property - def enable_pipelining(self): + def num_ipus(self): """ - Get the boolean of enable pipelining or not from IpuStrategy instance. + Get the number of IPU devices from IpuStrategy instance. """ - return self._ipu_strategy.enable_pipelining + return self.get_option('num_ipus') @property - def batches_per_step(self): + def is_training(self): """ - Get the number of batch_size per run in the pipelining mode from IpuStrategy instance. + Get the boolean of training or inference from IpuStrategy instance. """ - return self._ipu_strategy.batches_per_step + return self.get_option('is_training') @property - def accumulationFactor(self): + def enable_pipelining(self): """ - Get the number of micro-batches to accumulate before applying the varUpdate from IpuStrategy instance. + Get the boolean of enable pipelining or not from IpuStrategy instance. """ - return self._ipu_strategy.accumulationFactor + return self.get_option('enable_pipelining') @property def enable_fp16(self): """ Get the boolean of float16 mode or not from IpuStrategy instance. 
""" - return self._ipu_strategy.enable_fp16 + return self.get_option('enable_fp16') class IpuCompiledProgram(object): @@ -750,9 +817,9 @@ class IpuCompiledProgram(object): main_prog = static.default_main_program() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetGraphConfig(num_ipus=1, is_training=True, batch_size=1) - ipu_strategy.SetPipeliningConfig(enable_pipelining=False, batches_per_step=1, accumulationFactor=1) - ipu_strategy.SetHalfConfig(enable_fp16=False) + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) + ipu_strategy.set_precision_config(enable_fp16=False) ipu_compiled_program = static.IpuCompiledProgram( main_prog, @@ -766,14 +833,12 @@ def __init__(self, program=None, scope=None, ipu_strategy=None): ) if program is None: - program = default_main_program() + program = framework.default_main_program() if not isinstance(program, framework.Program): raise TypeError( "The type of program is wrong, expected Program, but got %s" % type(program)) - # import here to avoiding confused - import paddle self._program = program self._compiled = False @@ -781,23 +846,21 @@ def __init__(self, program=None, scope=None, ipu_strategy=None): if scope is not None: self._scope = scope else: + # import here to avoiding confused + import paddle self._scope = paddle.static.global_scope() if ipu_strategy is not None: - self._ipu_strategy = ipu_strategy._ipu_strategy + self._ipu_strategy = ipu_strategy else: - self._ipu_strategy = core.IpuStrategy() + self._ipu_strategy = IpuStrategy() - self._backend = core.IpuBackend() - self._backend.set_scope(self._scope) - self._backend.set_ipu_strategy(self._ipu_strategy) - self._graph_passes = [ - "optimizer_extract_pass", "optimizer_state_align_pass", - "forward_graph_extract_pass", "infer_shape_pass", "avg_shard_pass", - "popart_canonicalization_pass" - ] - global ipu_compiler_ref - ipu_compiler_ref = self + if ipu_strategy.has_custom_ops: + self._custom_op_names = set(ipu_strategy.custom_op_names) + else: + self._custom_op_names = () + + self._backend = core.IpuBackend.get_instance() def compile(self, feed_list, fetch_list): """ @@ -828,20 +891,23 @@ def compile(self, feed_list, fetch_list): main_prog = static.default_main_program() ipu_strategy = static.IpuStrategy() - ipu_strategy.SetGraphConfig(num_ipus=1, is_training=True, batch_size=1) - ipu_strategy.SetPipeliningConfig(enable_pipelining=False, batches_per_step=1, accumulationFactor=1) - ipu_strategy.SetHalfConfig(enable_fp16=False) + ipu_strategy.set_graph_config(num_ipus=1, is_training=True, batch_size=1) + ipu_strategy.set_pipelining_config(enable_pipelining=False, batches_per_step=1, accumulation_factor=1) + ipu_strategy.set_precision_config(enable_fp16=False) program = static.IpuCompiledProgram( main_prog, ipu_strategy=ipu_strategy).compile([a.name], [b.name]) """ + self._backend.set_scope(self._scope) + self._backend.set_ipu_strategy(self._ipu_strategy._ipu_strategy) + # feed and fetch doesn't have corresponding popart op, so we rm both here global_block = self._program.global_block() need_to_remove_op_index = [] for i, op in enumerate(global_block.ops): op.desc.set_is_target(False) - if op.type == "feed" or op.type == "fetch": + if op.type == 'feed' or op.type == 'fetch': need_to_remove_op_index.append(i) for index in need_to_remove_op_index[::-1]: @@ -854,26 +920,45 @@ def compile(self, feed_list, fetch_list): self._program.desc.flush() self._graph = 
core.Graph(self._program.desc) - for pass_name in self._graph_passes: - graph_pass = core.get_pass(pass_name) - if pass_name == "infer_shape_pass": - graph_pass.set("feed_list", feed_list) - graph_pass.apply(self._graph) - - ipu_inplace_pass = core.get_pass("ipu_inplace_pass") - ipu_inplace_pass.set("feed_list", feed_list) - ipu_inplace_pass.set("fetch_list", fetch_list) - ipu_inplace_pass.apply(self._graph) - - ipu_graph_builder_pass = core.get_pass("ipu_graph_builder_pass") - ipu_graph_builder_pass.set("feed_list", feed_list) - ipu_graph_builder_pass.set("fetch_list", fetch_list) - ipu_graph_builder_pass.apply(self._graph) - - ipu_runtime_replacer_pass = core.get_pass("ipu_runtime_replacer_pass") - ipu_runtime_replacer_pass.set("feed_list", feed_list) - ipu_runtime_replacer_pass.set("fetch_list", fetch_list) - ipu_runtime_replacer_pass.apply(self._graph) + if self._ipu_strategy.is_training: + passes = [ + 'optimizer_extract_pass', + 'optimizer_state_align_pass', + ] + for pass_name in passes: + a_pass = core.get_pass(pass_name) + a_pass.apply(self._graph) + + passes = [ + 'forward_graph_extract_pass', + 'infer_shape_pass', + 'avg_shard_pass', + 'delete_scale_op_pass', + ] + for pass_name in passes: + a_pass = core.get_pass(pass_name) + if pass_name == 'infer_shape_pass': + a_pass.set('feed_list', feed_list) + a_pass.apply(self._graph) + + a_pass = core.get_pass('popart_canonicalization_pass') + if self._custom_op_names: + a_pass.set('custom_ops', self._custom_op_names) + a_pass.apply(self._graph) + + a_pass = core.get_pass("transfer_cast_op_pass") + a_pass.apply(self._graph) + + passes = [ + 'ipu_inplace_pass', + 'ipu_graph_builder_pass', + 'ipu_runtime_replacer_pass', + ] + for pass_name in passes: + a_pass = core.get_pass(pass_name) + a_pass.set('feed_list', feed_list) + a_pass.set('fetch_list', fetch_list) + a_pass.apply(self._graph) convert_pass = core.get_pass('graph_to_program_pass') desc = core.ProgramDesc() @@ -904,9 +989,3 @@ def compile(self, feed_list, fetch_list): program.org_program = self._program return program - - def clean(self): - self._backend.clear() - - def __del__(self): - self.clean() diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 447d6457e0a3c..e372727b0f0b6 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1583,9 +1583,6 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, lr_sheduler = program.lr_sheduler lr_value = lr_sheduler() lr_var = program.global_block().vars[lr_sheduler._var_name] - if core.is_compiled_with_ipu(): - if hasattr(program.lr_sheduler, 'lr_var'): - lr_var = program.lr_sheduler.lr_var data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) tensor = core.get_variable_tensor(scope, lr_sheduler._var_name) tensor.set(data, self.place) From f77019a0ad0b967a753bf6ef11f9e3884d5c4dc4 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Thu, 24 Feb 2022 21:34:40 +0800 Subject: [PATCH 40/85] [PTen->Phi PR3] Rename pten make target to phi (#39832) * rename pten to phi * fix infrt compile failed * resolve conflict --- cmake/generic.cmake | 20 ++++----- cmake/inference_lib.cmake | 10 ++--- cmake/{pten.cmake => phi.cmake} | 10 ++--- cmake/{pten_header.cmake => phi_header.cmake} | 16 +++---- .../distributed/collective/CMakeLists.txt | 4 +- paddle/fluid/eager/CMakeLists.txt | 8 ++-- .../fluid/eager/accumulation/CMakeLists.txt | 2 +- .../eager_generated/backwards/CMakeLists.txt | 2 +- .../eager_generated/forwards/CMakeLists.txt | 2 +- 
paddle/fluid/eager/api/utils/CMakeLists.txt | 4 +- paddle/fluid/framework/CMakeLists.txt | 16 +++---- paddle/fluid/imperative/CMakeLists.txt | 12 ++--- paddle/fluid/imperative/tests/CMakeLists.txt | 2 +- paddle/fluid/inference/CMakeLists.txt | 10 ++--- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/platform/CMakeLists.txt | 8 ++-- .../fluid/platform/device/gpu/CMakeLists.txt | 4 +- .../platform/device/gpu/cuda/CMakeLists.txt | 2 +- .../fluid/platform/device/xpu/CMakeLists.txt | 2 +- paddle/fluid/platform/dynload/CMakeLists.txt | 18 ++++---- paddle/fluid/pybind/CMakeLists.txt | 4 +- paddle/infrt/CMakeLists.txt | 3 +- paddle/infrt/kernel/phi/CMakeLists.txt | 2 +- paddle/phi/CMakeLists.txt | 18 ++++---- paddle/phi/api/CMakeLists.txt | 2 +- paddle/phi/api/lib/CMakeLists.txt | 26 +++++------ paddle/phi/api/lib/utils/CMakeLists.txt | 2 +- paddle/phi/backends/CMakeLists.txt | 8 ++-- paddle/phi/backends/cpu/CMakeLists.txt | 4 +- paddle/phi/backends/custom/CMakeLists.txt | 2 +- paddle/phi/backends/dynload/CMakeLists.txt | 22 +++++----- paddle/phi/backends/gpu/CMakeLists.txt | 6 +-- paddle/phi/backends/gpu/cuda/CMakeLists.txt | 2 +- paddle/phi/backends/gpu/rocm/CMakeLists.txt | 2 +- paddle/phi/backends/xpu/CMakeLists.txt | 4 +- paddle/phi/common/CMakeLists.txt | 2 +- paddle/phi/core/CMakeLists.txt | 24 +++++----- paddle/phi/core/compat/CMakeLists.txt | 8 ++-- paddle/phi/kernels/CMakeLists.txt | 10 ++--- paddle/phi/tests/CMakeLists.txt | 2 +- paddle/phi/tests/api/CMakeLists.txt | 44 +++++++++---------- paddle/phi/tests/common/CMakeLists.txt | 8 ++-- paddle/phi/tests/core/CMakeLists.txt | 4 +- paddle/phi/tests/kernels/CMakeLists.txt | 30 ++++++------- .../{ops_signature => ops}/CMakeLists.txt | 0 .../test_op_signature.cc | 2 +- .../test_op_signature.h | 0 paddle/phi/tools/CMakeLists.txt | 2 +- 48 files changed, 199 insertions(+), 198 deletions(-) rename cmake/{pten.cmake => phi.cmake} (97%) rename cmake/{pten_header.cmake => phi_header.cmake} (69%) rename paddle/phi/tests/{ops_signature => ops}/CMakeLists.txt (100%) rename paddle/phi/tests/{ops_signature => ops}/test_op_signature.cc (98%) rename paddle/phi/tests/{ops_signature => ops}/test_op_signature.h (100%) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 847073fb7b57c..f7c17bd7cfe7e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -116,19 +116,19 @@ function(find_fluid_modules TARGET_NAME) endif() endfunction(find_fluid_modules) -set_property(GLOBAL PROPERTY PTEN_MODULES "") -# find all pten modules is used for paddle static library +set_property(GLOBAL PROPERTY PHI_MODULES "") +# find all phi modules is used for paddle static library # for building inference libs -function(find_pten_modules TARGET_NAME) +function(find_phi_modules TARGET_NAME) get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path}) string(FIND "${__target_path}" "phi" pos) if(pos GREATER 1) - get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) - set(pten_modules ${pten_modules} ${TARGET_NAME}) - set_property(GLOBAL PROPERTY PTEN_MODULES "${pten_modules}") + get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) + set(phi_modules ${phi_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY PHI_MODULES "${phi_modules}") endif() -endfunction(find_pten_modules) +endfunction(find_phi_modules) function(common_link TARGET_NAME) if (WITH_PROFILER) @@ -324,7 +324,7 @@ function(cc_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) 
find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if(cc_library_DEPS) # Don't need link libwarpctc.so @@ -497,7 +497,7 @@ function(nv_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (nv_library_DEPS) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) @@ -588,7 +588,7 @@ function(hip_library TARGET_NAME) else() hip_add_library(${TARGET_NAME} STATIC ${hip_library_SRCS}) find_fluid_modules(${TARGET_NAME}) - find_pten_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (hip_library_DEPS) add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index b8d1f4eb116a9..c48d31f7e4f90 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -224,7 +224,7 @@ copy(inference_lib_dist DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/crypto/) include_directories(${CMAKE_BINARY_DIR}/../paddle/fluid/framework/io) -# copy api headers for pten & custom op +# copy api headers for phi & custom op copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/phi/api/ext/*.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext/) @@ -244,11 +244,11 @@ copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) -# the header file of pten is copied to the experimental directory, -# the include path of pten needs to be changed to adapt to inference api path +# the header file of phi is copied to the experimental directory, +# the include path of phi needs to be changed to adapt to inference api path add_custom_command(TARGET inference_lib_dist POST_BUILD - COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/pten_header.cmake" - COMMENT "Change pten header include path to adapt to inference api path") + COMMAND ${CMAKE_COMMAND} -P "${PADDLE_SOURCE_DIR}/cmake/phi_header.cmake" + COMMENT "Change phi header include path to adapt to inference api path") # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING diff --git a/cmake/pten.cmake b/cmake/phi.cmake similarity index 97% rename from cmake/pten.cmake rename to cmake/phi.cmake index 5645ac6cfa303..f1a6f8e45a74c 100644 --- a/cmake/pten.cmake +++ b/cmake/phi.cmake @@ -51,7 +51,7 @@ function(generate_unify_header DIR_NAME) endforeach() # append header into extension.h string(REPLACE "${PADDLE_SOURCE_DIR}\/" "" header_file "${header_file}") - file(APPEND ${pten_extension_header_file} "#include \"${header_file}\"\n") + file(APPEND ${phi_extension_header_file} "#include \"${header_file}\"\n") endfunction() # call kernel_declare need to make sure whether the target of input exists @@ -240,10 +240,10 @@ function(kernel_library TARGET) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) - # append target into PTEN_KERNELS property - get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) - set(pten_kernels ${pten_kernels} ${TARGET}) - set_property(GLOBAL PROPERTY PTEN_KERNELS ${pten_kernels}) + # append target into PHI_KERNELS property + get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) + set(phi_kernels ${phi_kernels} ${TARGET}) + set_property(GLOBAL PROPERTY PHI_KERNELS 
${phi_kernels}) endif() # parse kernel name and auto generate kernel declaration diff --git a/cmake/pten_header.cmake b/cmake/phi_header.cmake similarity index 69% rename from cmake/pten_header.cmake rename to cmake/phi_header.cmake index 6341aca9ec739..c9b7e465337dd 100644 --- a/cmake/pten_header.cmake +++ b/cmake/phi_header.cmake @@ -14,8 +14,8 @@ set(PADDLE_INFERENCE_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_install_dir") -function(pten_header_path_compat TARGET_PATH) -message(STATUS "pten header path compat processing: ${TARGET_PATH}") +function(phi_header_path_compat TARGET_PATH) +message(STATUS "phi header path compat processing: ${TARGET_PATH}") string(FIND ${TARGET_PATH} "experimental" pos) if (pos GREATER 1) file(GLOB HEADERS "${TARGET_PATH}/*" "*.h") @@ -25,17 +25,17 @@ if (pos GREATER 1) string(REPLACE "paddle/phi/" "paddle/include/experimental/phi/" HEADER_CONTENT "${HEADER_CONTENT}") string(REPLACE "paddle/utils/" "paddle/include/experimental/utils/" HEADER_CONTENT "${HEADER_CONTENT}") file(WRITE ${header} "${HEADER_CONTENT}") - message(STATUS "pten header path compat processing complete: ${header}") + message(STATUS "phi header path compat processing complete: ${header}") endif() endforeach() endif() endfunction() -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) -pten_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/ext) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/api/include) +phi_header_path_compat(${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/phi/common) # In order to be compatible with the original behavior, the header file name needs to be changed file(RENAME ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/extension.h diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 5daaf29ae2895..41652f8b6ed6f 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,5 +1,5 @@ -cc_library(processgroup SRCS ProcessGroup.cc DEPS pten pten_api eager_api) +cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) if(WITH_NCCL) - cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context pten pten_api eager_api) + cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) endif() diff --git a/paddle/fluid/eager/CMakeLists.txt b/paddle/fluid/eager/CMakeLists.txt index 711c46e995286..5e16ab2b391d0 100644 --- a/paddle/fluid/eager/CMakeLists.txt +++ b/paddle/fluid/eager/CMakeLists.txt @@ -1,4 +1,4 @@ -set(eager_deps pten pten_api hook_utils tensor_utils utils global_utils backward pten_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) +set(eager_deps phi 
phi_api hook_utils tensor_utils utils global_utils backward phi_tensor tracer layer autograd_meta grad_node_info grad_tensor_holder accumulation_node) set(fluid_deps tracer layer proto_desc operator op_registry variable_helper memcpy) set(generated_deps dygraph_function dygraph_node) @@ -10,11 +10,11 @@ endif() add_subdirectory(api) add_subdirectory(accumulation) -cc_library(grad_node_info SRCS grad_node_info.cc DEPS pten pten_api) +cc_library(grad_node_info SRCS grad_node_info.cc DEPS phi phi_api) cc_library(grad_tensor_holder SRCS grad_tensor_holder.cc DEPS grad_node_info gradient_accumulator) -cc_library(autograd_meta SRCS autograd_meta.cc DEPS pten pten_api) -cc_library(utils SRCS utils.cc DEPS pten pten_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) +cc_library(autograd_meta SRCS autograd_meta.cc DEPS phi phi_api) +cc_library(utils SRCS utils.cc DEPS phi phi_api global_utils layer proto_desc operator op_registry variable_helper memcpy scale_op autograd_meta hook_utils) cc_library(backward SRCS backward.cc DEPS grad_tensor_holder utils autograd_meta grad_node_info) add_subdirectory(tests) diff --git a/paddle/fluid/eager/accumulation/CMakeLists.txt b/paddle/fluid/eager/accumulation/CMakeLists.txt index 632e289ba2308..43ca707f4f6fb 100644 --- a/paddle/fluid/eager/accumulation/CMakeLists.txt +++ b/paddle/fluid/eager/accumulation/CMakeLists.txt @@ -1 +1 @@ -cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator pten pten_api grad_node_info) +cc_library(accumulation_node SRCS accumulation_node.cc DEPS gradient_accumulator phi phi_api grad_node_info) diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt index e3fafb265ad98..77d8ec57efcaa 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(scale_node SRCS scale_node.cc DEPS global_utils pten pten_api grad_node_info) +cc_library(scale_node SRCS scale_node.cc DEPS global_utils phi phi_api grad_node_info) if(NOT ON_INFER) cc_library(final_dygraph_node SRCS nodes.cc DEPS ${eager_deps}) diff --git a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt index 8ede139ddc044..60b35340eabd1 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt +++ b/paddle/fluid/eager/api/generated/eager_generated/forwards/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(eager_scale SRCS scale.cc DEPS pten_api pten autograd_meta scale_node) +cc_library(eager_scale SRCS scale.cc DEPS phi_api phi autograd_meta scale_node) if(NOT ON_INFER) cc_library(final_dygraph_function SRCS dygraph_functions.cc DEPS ${eager_deps}) diff --git a/paddle/fluid/eager/api/utils/CMakeLists.txt b/paddle/fluid/eager/api/utils/CMakeLists.txt index 3a4f0ba320358..c34df3972c23e 100644 --- a/paddle/fluid/eager/api/utils/CMakeLists.txt +++ b/paddle/fluid/eager/api/utils/CMakeLists.txt @@ -1,3 +1,3 @@ -cc_library(tensor_utils SRCS tensor_utils.cc DEPS pten pten_api autograd_meta grad_node_info accumulation_node) -cc_library(hook_utils SRCS hook_utils.cc DEPS pten tensor_utils autograd_meta grad_node_info utils accumulation_node) +cc_library(tensor_utils SRCS tensor_utils.cc DEPS phi phi_api autograd_meta grad_node_info accumulation_node) 
+cc_library(hook_utils SRCS hook_utils.cc DEPS phi tensor_utils autograd_meta grad_node_info utils accumulation_node) cc_library(global_utils SRCS global_utils.cc DEPS place tracer) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 7d527e24a0079..082c508174332 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -193,19 +193,19 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) IF(WITH_XPU) -cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info xpu_op_list) +cc_library(phi_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list) ELSE() -cc_library(pten_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place pten var_type_traits pten_api_utils op_info) +cc_library(phi_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info) ENDIF() IF(WITH_XPU) cc_library(operator SRCS operator.cc DEPS xpu_op_list op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils op_utils) + phi phi_utils kernel_factory infershape_utils op_utils) ELSE() cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog trainer_desc_proto data_feed_proto shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type op_call_stack unused_var_check nan_inf_utils - pten pten_utils kernel_factory infershape_utils op_utils) + phi phi_utils kernel_factory infershape_utils op_utils) ENDIF() cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -412,7 +412,7 @@ cc_library(save_load_util SRCS save_load_util.cc DEPS tensor scope layer) cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer) cc_library(generator SRCS generator.cc DEPS enforce place) -cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place pten var_type_traits pten pten_api_utils op_info shape_inference) +cc_library(infershape_utils SRCS infershape_utils.cc DEPS lod_tensor selected_rows_utils attribute place phi var_type_traits phi phi_api_utils op_info shape_inference) cc_test(infershape_utils_test SRCS infershape_utils_test.cc DEPS infershape_utils infermeta_utils meta_tensor) # Get the current working branch @@ -436,8 +436,8 @@ message(STATUS "branch: ${PADDLE_BRANCH}") configure_file(commit.h.in commit.h) -cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper pten_tensor op_meta_info pten_api) -cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry pten_custom_kernel pten_tensor_raw) +cc_library(custom_operator SRCS custom_operator.cc DEPS tensor attribute framework_proto op_registry operator dynamic_loader string_helper phi_tensor op_meta_info phi_api) +cc_library(custom_kernel SRCS custom_kernel.cc DEPS op_registry phi_custom_kernel phi_tensor_raw) #cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) #cc_binary(new_executor SRCS new_exec_test.cc DEPS 
operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) @@ -450,7 +450,7 @@ if(WITH_TESTING AND TEST selected_rows_utils_test) endif() cc_test(scope_guard_test SRCS scope_guard_test.cc) -cc_test(pten_utils_test SRCS pten_utils_test.cc DEPS pten_utils) +cc_test(phi_utils_test SRCS pten_utils_test.cc DEPS phi_utils) if(WITH_GPU OR WITH_ROCM) cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 72f7e5af9a96e..f198919b0c87b 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -1,11 +1,11 @@ cc_library(imperative_flag SRCS flags.cc DEPS gflags flags) -cc_library(var_helper SRCS var_helper.cc DEPS tensor pten_api) +cc_library(var_helper SRCS var_helper.cc DEPS tensor phi_api) IF(WITH_XPU) -cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS xpu_op_list proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) ELSE() -cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils pten_api pten pten_utils var_helper) +cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator device_context lod_tensor selected_rows_utils var_type_traits op_kernel_type data_transform nan_inf_utils phi_api phi phi_utils var_helper) ENDIF() -cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper pten_api) +cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry var_helper phi_api) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer var_helper) cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal garbage_collector var_helper) @@ -47,9 +47,9 @@ if(WITH_GLOO) endif() if(NOT WITH_ASCEND_CL) -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function pten_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function phi_tensor) else() -cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner pten_tensor) +cc_library(gradient_accumulator SRCS gradient_accumulator.cc DEPS blas operator lod_tensor selected_rows_utils selected_rows_functor var_type_traits layer math_function npu_op_runner phi_tensor) endif() add_subdirectory(tests) diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt b/paddle/fluid/imperative/tests/CMakeLists.txt index a9c81cb87798b..e4f1cfdb3baee 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -15,7 +15,7 @@ else() endif(WIN32) -cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor 
gradient_accumulator math_function pten_tensor pten_api pten_api_utils) +cc_test(test_gradient_accmulator SRCS test_gradient_accmulator.cc DEPS memcpy selected_rows_utils selected_rows_functor gradient_accumulator math_function phi_tensor phi_api phi_api_utils) cc_test(test_layer SRCS test_layer.cc DEPS layer proto_desc operator op_registry variable_helper mul_op memcpy) cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info split_op layer concat_and_split activation_op place) cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 887bd52bae547..26b8b9e8e17e0 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -35,7 +35,7 @@ endif() # fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) -get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES) +get_property(phi_modules GLOBAL PROPERTY PHI_MODULES) set(utils_modules stringpiece pretty_log string_helper) add_subdirectory(api) @@ -47,11 +47,11 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg}) #TODO(wilber, T8T9): Do we still need to support windows gpu static library? if(WIN32 AND WITH_GPU) - cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) + cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) elseif(WITH_IPU) - cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu) + cc_library(paddle_inference DEPS ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules} paddle_ipu) else() - create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules}) + create_static_lib(paddle_inference ${fluid_modules} ${phi_modules} ${STATIC_INFERENCE_API} ${utils_modules}) endif() if(NOT APPLE) @@ -81,7 +81,7 @@ set(SHARED_INFERENCE_SRCS ${PADDLE_CUSTOM_OP_SRCS}) # shared inference library deps -set(SHARED_INFERENCE_DEPS ${fluid_modules} ${pten_modules} analysis_predictor) +set(SHARED_INFERENCE_DEPS ${fluid_modules} ${phi_modules} analysis_predictor) if (WITH_CRYPTO) set(SHARED_INFERENCE_DEPS ${SHARED_INFERENCE_DEPS} paddle_crypto) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a279c76430f1b..91a0352e1915e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -100,7 +100,7 @@ else() cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} pten pten_api_utils gather_scatter_kernel) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 37709c953e13b..04c8a329e5e1a 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -28,7 
+28,7 @@ cc_library(denormal SRCS denormal.cc DEPS) cc_test(errors_test SRCS errors_test.cc DEPS errors enforce) -set(enforce_deps flags errors boost flags pten_enforce) +set(enforce_deps flags errors boost flags phi_enforce) if(WITH_GPU) set(enforce_deps ${enforce_deps} external_error_proto) endif() @@ -52,7 +52,7 @@ ELSE() cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() -cc_library(place SRCS place.cc DEPS enforce boost pten_place) +cc_library(place SRCS place.cc DEPS enforce boost phi_place) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) IF(WITH_MKLDNN) @@ -122,7 +122,7 @@ cc_library(init SRCS init.cc DEPS device_context custom_kernel) # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies cc_library(device_context SRCS device_context.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS} - place pten_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} + place phi_place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS} ${MLU_CTX_DEPS} eigen3 cpu_context generator) if(WITH_XPU) target_link_libraries(device_context xpu_context) @@ -138,7 +138,7 @@ if(WITH_CNCL) endif() if(WITH_GPU OR WITH_ROCM) - target_link_libraries(device_context gpu_info gpu_context pten_gpu_info) + target_link_libraries(device_context gpu_info gpu_context phi_gpu_info) target_link_libraries(device_context gpu_resource_pool) endif() if (WITH_CUSTOM_DEVICE) diff --git a/paddle/fluid/platform/device/gpu/CMakeLists.txt b/paddle/fluid/platform/device/gpu/CMakeLists.txt index 00f0cc2ac92bf..f7c13ec7ed5ed 100644 --- a/paddle/fluid/platform/device/gpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/CMakeLists.txt @@ -1,12 +1,12 @@ IF(WITH_GPU) add_subdirectory(cuda) - nv_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) + nv_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) nv_test(cuda_helper_test SRCS cuda_helper_test.cu) nv_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) ELSEIF(WITH_ROCM) add_subdirectory(rocm) - hip_library(gpu_info SRCS gpu_info.cc DEPS pten_gpu_info gflags glog enforce monitor dynload_cuda) + hip_library(gpu_info SRCS gpu_info.cc DEPS phi_gpu_info gflags glog enforce monitor dynload_cuda) hip_test(cuda_helper_test SRCS cuda_helper_test.cu) hip_test(cudnn_desc_test SRCS cudnn_desc_test.cc DEPS dynload_cuda) diff --git a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt index 8f7fd3dcbc03a..85050038d5a83 100644 --- a/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt +++ b/paddle/fluid/platform/device/gpu/cuda/CMakeLists.txt @@ -1,4 +1,4 @@ nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) -nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda pten) +nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda phi) diff --git a/paddle/fluid/platform/device/xpu/CMakeLists.txt b/paddle/fluid/platform/device/xpu/CMakeLists.txt index 28573eb0c1e4c..b6a26f2554a13 100644 --- a/paddle/fluid/platform/device/xpu/CMakeLists.txt +++ b/paddle/fluid/platform/device/xpu/CMakeLists.txt @@ -5,7 +5,7 @@ 
endif() set(XPU_CTX_DEPS xpulib ssl crypto rt z resolv dl) -cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place pten_xpu_info) +cc_library(xpu_info SRCS xpu_info.cc DEPS gflags glog enforce xpulib device_context place phi_xpu_info) cc_library(xpu_op_list SRCS xpu_op_list.cc DEPS gflags glog enforce xpulib device_context op_kernel_type) add_subdirectory(tests) diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index 49391a65b185b..87aa5dcde626b 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce pten_dynamic_loader) +cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce phi_dynamic_loader) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) @@ -34,24 +34,24 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader pten_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) + hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader phi_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) elseif (WITH_ASCEND_CL) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl pten_dynload_warpctc) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc npu_hccl phi_dynload_warpctc) else() - nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader pten_dynload_cuda) - cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc pten_dynload_warpctc) + nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader phi_dynload_cuda) + cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc phi_dynload_warpctc) endif() if (WITH_MKLML) - cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml pten_dynload_mklml) + cc_library(dynload_mklml SRCS mklml.cc DEPS dynamic_loader mklml phi_dynload_mklml) endif() -cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader pten_dynload_lapack) +cc_library(dynload_lapack SRCS lapack.cc DEPS dynamic_loader phi_dynload_lapack) add_dependencies(dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? 
if (MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader pten_dynload_mklrt) + cc_library(dynload_mklrt SRCS mklrt.cc DEPS dynamic_loader phi_dynload_mklrt) target_include_directories(dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index e76183192bcee..1f06eda8a2ee5 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,7 +2,7 @@ set(PYBIND_DEPS init pybind python proto_desc memory executor fleet_wrapper box_ feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model cuda_graph_with_memory_pool fleet_executor global_utils pten_utils tcp_store) + cost_model cuda_graph_with_memory_pool fleet_executor global_utils phi_utils tcp_store) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) @@ -299,7 +299,7 @@ if(WITH_PYTHON) if(NOT ON_INFER) cc_library(paddle_eager SRCS eager.cc eager_functions.cc eager_method.cc eager_properties.cc eager_utils.cc - DEPS eager_api autograd_meta backward grad_node_info pten op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) + DEPS eager_api autograd_meta backward grad_node_info phi op_function_common final_dygraph_function final_dygraph_node dygraph_function dygraph_node accumulation_node global_utils utils python) add_dependencies(paddle_eager eager_codegen) add_dependencies(paddle_eager eager_op_function_generator_cmd) list(APPEND PYBIND_DEPS paddle_eager) diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index 0f6dfb9d8f44e..f2a78db558ee2 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -97,8 +97,9 @@ set(infrt_mlir_incs rewrite_inc trt_ops_inc ) + if (INFRT_WITH_PHI) - set(phi_libs pten) + set(phi_libs phi) set(infrt_mlir_incs ${infrt_mlir_incs} MLIRinfrt_phi_tensorIncGen MLIRinfrt_phi_baseIncGen diff --git a/paddle/infrt/kernel/phi/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt index e21cacfbc10b3..30a2621f4abdf 100644 --- a/paddle/infrt/kernel/phi/CMakeLists.txt +++ b/paddle/infrt/kernel/phi/CMakeLists.txt @@ -25,7 +25,7 @@ add_custom_command( cc_library(infrt_naive SRCS infershaped/infershaped_kernel_launcher.cc infershaped/infershaped_kernel_launchers.cc - DEPS pten wrapped_infermeta) + DEPS phi wrapped_infermeta) cc_test_tiny(test_infrt_infershape_launchers SRCS infershaped/infershape_launchers_test.cc DEPS infrt) diff --git a/paddle/phi/CMakeLists.txt b/paddle/phi/CMakeLists.txt index cc95e0bf8fdcc..7b074d0ebb76d 100644 --- a/paddle/phi/CMakeLists.txt +++ b/paddle/phi/CMakeLists.txt @@ -1,5 +1,5 @@ -# pten auto cmake utils -include(pten) +# phi auto cmake utils +include(phi) # paddle experimental common components add_subdirectory(common) @@ -23,16 +23,16 @@ add_subdirectory(tools) add_subdirectory(tests) # make an unity target for compile deps -set(PTEN_DEPS convert_utils dense_tensor pten_context kernel_factory kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) -get_property(pten_kernels GLOBAL PROPERTY PTEN_KERNELS) +set(PHI_DEPS convert_utils dense_tensor phi_context kernel_factory 
kernel_context arg_map_context infermeta lod_utils op_compat_infos sparse_csr_tensor sparse_coo_tensor) +get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) # keep this message for debug, remove it later if needless -message(STATUS "All standard pten kernels: ${pten_kernels}") -set(PTEN_DEPS ${PTEN_DEPS} ${pten_kernels}) +message(STATUS "All standard phi kernels: ${phi_kernels}") +set(PHI_DEPS ${PHI_DEPS} ${phi_kernels}) -cc_library(pten DEPS ${PTEN_DEPS}) +cc_library(phi DEPS ${PHI_DEPS}) -set(pten_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file") -file(WRITE ${pten_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") +set(phi_extension_header_file ${CMAKE_CURRENT_SOURCE_DIR}/extension.h CACHE INTERNAL "phi/extension.h file") +file(WRITE ${phi_extension_header_file} "// Header file generated by paddle/phi/CMakeLists.txt for external users,\n// DO NOT edit or include it within paddle.\n\n#pragma once\n\n") # generate inner headers include dir for users generate_unify_header(backends) diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt index a993cb3ff8041..c2ba5d406ba7b 100644 --- a/paddle/phi/api/CMakeLists.txt +++ b/paddle/phi/api/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(lib) -cc_library(pten_api SRCS all.cc DEPS pten_function_api pten_bw_function_api manual_api sparse_api) +cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api manual_api sparse_api) diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 1ebddc3d3cd1b..d50f62d309066 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -3,11 +3,11 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce manual_api) elseif (WITH_ROCM) - hip_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce manual_api) else() - cc_library(pten_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor pten_api_utils ext_compat_utils pten_enforce manual_api) + cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce manual_api) endif() set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) @@ -83,17 +83,17 @@ add_custom_command( DEPENDS ${api_yaml_file} ${wrapped_infermeta_gen_file} ${api_gen_base} VERBATIM) -cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS pten_tensor_raw pten_context kernel_factory) -cc_library(pten_data_transform SRCS data_transform.cc DEPS pten_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) -cc_library(manual_api SRCS manual_api.cc DEPS pten_tensor_raw pten kernel_dispatch pten_data_transform) +cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) +cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) +cc_library(manual_api SRCS manual_api.cc DEPS 
phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(pten_tensor SRCS tensor_method.cc DEPS pten_tensor_raw pten_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) -cc_library(op_meta_info SRCS op_meta_info.cc DEPS pten_tensor) +cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor) -cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS pten) +cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) -cc_library(sparse_api SRCS sparse_api.cc DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_function_api SRCS ${api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform wrapped_infermeta) -cc_library(pten_dygraph_api SRCS ${dygraph_api_source_file} DEPS pten_tensor pten kernel_dispatch pten_data_transform) -cc_library(pten_bw_function_api SRCS ${bw_api_source_file} DEPS pten_tensor pten kernel_dispatch backward_infermeta pten_data_transform pten_function_api) +cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor phi kernel_dispatch phi_data_transform) +cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor phi kernel_dispatch phi_data_transform wrapped_infermeta) +cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor phi kernel_dispatch phi_data_transform) +cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api) diff --git a/paddle/phi/api/lib/utils/CMakeLists.txt b/paddle/phi/api/lib/utils/CMakeLists.txt index 74ecb3cd65262..6d056b54b7005 100644 --- a/paddle/phi/api/lib/utils/CMakeLists.txt +++ b/paddle/phi/api/lib/utils/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_api_utils SRCS storage.cc tensor_utils.cc DEPS +cc_library(phi_api_utils SRCS storage.cc tensor_utils.cc DEPS tensor_base convert_utils dense_tensor lod_tensor selected_rows_utils place var_type_traits) diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 38366d57841b0..43e477ef32e9c 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -12,16 +12,16 @@ if(WITH_XPU) add_subdirectory(xpu) endif() -cc_library(pten_context SRCS all_context.cc DEPS device_context cpu_context) +cc_library(phi_context SRCS all_context.cc DEPS device_context cpu_context) if(WITH_XPU) - add_dependencies(pten_context xpu_context) + add_dependencies(phi_context xpu_context) endif() if(WITH_GPU) - add_dependencies(pten_context gpu_context) + add_dependencies(phi_context gpu_context) endif() if(WITH_CUSTOM_DEVICE) - add_dependencies(pten_context custom_context) + add_dependencies(phi_context custom_context) endif() diff --git a/paddle/phi/backends/cpu/CMakeLists.txt b/paddle/phi/backends/cpu/CMakeLists.txt index 965b33f3800ed..82ea42566fc1f 100644 --- a/paddle/phi/backends/cpu/CMakeLists.txt +++ b/paddle/phi/backends/cpu/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_MKLDNN) # TODO(wilber): support mkldnn context. 
- cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context mkldnn eigen3) + cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context mkldnn eigen3) else() - cc_library(cpu_context SRCS cpu_context.cc DEPS pten_device_context eigen3) + cc_library(cpu_context SRCS cpu_context.cc DEPS phi_device_context eigen3) endif() diff --git a/paddle/phi/backends/custom/CMakeLists.txt b/paddle/phi/backends/custom/CMakeLists.txt index 9a7de35dd4e66..cb54d3675687d 100644 --- a/paddle/phi/backends/custom/CMakeLists.txt +++ b/paddle/phi/backends/custom/CMakeLists.txt @@ -1,3 +1,3 @@ if (WITH_CUSTOM_DEVICE) - cc_library(custom_context SRCS custom_context.cc DEPS pten_device_context device_manager) + cc_library(custom_context SRCS custom_context.cc DEPS phi_device_context device_manager) endif() diff --git a/paddle/phi/backends/dynload/CMakeLists.txt b/paddle/phi/backends/dynload/CMakeLists.txt index b7242fc76df7c..bc5ef3cd5c078 100644 --- a/paddle/phi/backends/dynload/CMakeLists.txt +++ b/paddle/phi/backends/dynload/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(pten_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) +cc_library(phi_dynamic_loader SRCS dynamic_loader.cc DEPS enforce glog gflags) list(APPEND CUDA_SRCS cublas.cc cublasLt.cc cudnn.cc curand.cc cusolver.cc cusparse.cc nvtx.cc cufft.cc) @@ -34,24 +34,24 @@ if (CUPTI_FOUND) list(APPEND CUDA_SRCS cupti.cc) endif(CUPTI_FOUND) if(WITH_ROCM) - hip_library(pten_dynload_cuda SRCS ${HIP_SRCS} DEPS pten_dynamic_loader) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) + hip_library(phi_dynload_cuda SRCS ${HIP_SRCS} DEPS phi_dynamic_loader) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) elseif (WITH_ASCEND_CL) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc npu_hccl) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc npu_hccl) else() - nv_library(pten_dynload_cuda SRCS ${CUDA_SRCS} DEPS pten_dynamic_loader) - cc_library(pten_dynload_warpctc SRCS warpctc.cc DEPS pten_dynamic_loader warpctc) + nv_library(phi_dynload_cuda SRCS ${CUDA_SRCS} DEPS phi_dynamic_loader) + cc_library(phi_dynload_warpctc SRCS warpctc.cc DEPS phi_dynamic_loader warpctc) endif() if (WITH_MKLML) - cc_library(pten_dynload_mklml SRCS mklml.cc DEPS pten_dynamic_loader mklml) + cc_library(phi_dynload_mklml SRCS mklml.cc DEPS phi_dynamic_loader mklml) endif() -cc_library(pten_dynload_lapack SRCS lapack.cc DEPS pten_dynamic_loader) -add_dependencies(pten_dynload_lapack extern_lapack) +cc_library(phi_dynload_lapack SRCS lapack.cc DEPS phi_dynamic_loader) +add_dependencies(phi_dynload_lapack extern_lapack) # TODO(TJ): add iomp, mkldnn? 
if (MKL_FOUND AND WITH_ONEMKL) message("ONEMKL INCLUDE directory is ${MKL_INCLUDE}") - cc_library(pten_dynload_mklrt SRCS mklrt.cc DEPS pten_dynamic_loader) - target_include_directories(pten_dynload_mklrt PRIVATE ${MKL_INCLUDE}) + cc_library(phi_dynload_mklrt SRCS mklrt.cc DEPS phi_dynamic_loader) + target_include_directories(phi_dynload_mklrt PRIVATE ${MKL_INCLUDE}) endif() diff --git a/paddle/phi/backends/gpu/CMakeLists.txt b/paddle/phi/backends/gpu/CMakeLists.txt index 09591f79ae8fc..d14e94024f90f 100644 --- a/paddle/phi/backends/gpu/CMakeLists.txt +++ b/paddle/phi/backends/gpu/CMakeLists.txt @@ -1,9 +1,9 @@ if(WITH_GPU) add_subdirectory(cuda) - nv_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_cuda_info gflags glog enforce pten_dynload_cuda) + nv_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_cuda_info gflags glog enforce phi_dynload_cuda) elseif(WITH_ROCM) add_subdirectory(rocm) - hip_library(pten_gpu_info SRCS gpu_info.cc DEPS pten_rocm_info gflags glog enforce pten_dynload_cuda) + hip_library(phi_gpu_info SRCS gpu_info.cc DEPS phi_rocm_info gflags glog enforce phi_dynload_cuda) endif() -cc_library(gpu_context SRCS gpu_context.cc DEPS pten_device_context pten_gpu_info eigen3) +cc_library(gpu_context SRCS gpu_context.cc DEPS phi_device_context phi_gpu_info eigen3) diff --git a/paddle/phi/backends/gpu/cuda/CMakeLists.txt b/paddle/phi/backends/gpu/cuda/CMakeLists.txt index 7eb1983a793bc..a3393f97d7559 100644 --- a/paddle/phi/backends/gpu/cuda/CMakeLists.txt +++ b/paddle/phi/backends/gpu/cuda/CMakeLists.txt @@ -1 +1 @@ -nv_library(pten_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce pten_dynload_cuda) +nv_library(phi_cuda_info SRCS cuda_info.cc DEPS gflags glog enforce phi_dynload_cuda) diff --git a/paddle/phi/backends/gpu/rocm/CMakeLists.txt b/paddle/phi/backends/gpu/rocm/CMakeLists.txt index 181f92cbfc31c..257e4cc8afbcf 100644 --- a/paddle/phi/backends/gpu/rocm/CMakeLists.txt +++ b/paddle/phi/backends/gpu/rocm/CMakeLists.txt @@ -1 +1 @@ -hip_library(pten_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce pten_dynload_cuda) +hip_library(phi_rocm_info SRCS rocm_info.cc DEPS gflags glog enforce phi_dynload_cuda) diff --git a/paddle/phi/backends/xpu/CMakeLists.txt b/paddle/phi/backends/xpu/CMakeLists.txt index 65341dd206fd3..4d885757bb1a6 100644 --- a/paddle/phi/backends/xpu/CMakeLists.txt +++ b/paddle/phi/backends/xpu/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(pten_xpu_info SRCS xpu_info.cc DEPS enforce xpulib pten_place) -cc_library(xpu_context SRCS xpu_context.cc DEPS pten_device_context pten_xpu_info) +cc_library(phi_xpu_info SRCS xpu_info.cc DEPS enforce xpulib phi_place) +cc_library(xpu_context SRCS xpu_context.cc DEPS phi_device_context phi_xpu_info) diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index feaf0e12bdb16..85a1424ee34e0 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1 +1 @@ -cc_library(pten_place SRCS place.cc) +cc_library(phi_place SRCS place.cc) diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index 6ada063069905..d3c206c99dc22 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -2,30 +2,30 @@ add_subdirectory(compat) cc_library(errors SRCS errors.cc) -set(pten_enforce_deps errors flags) +set(phi_enforce_deps errors flags) if(WITH_GPU) - set(pten_enforce_deps ${pten_enforce_deps} external_error_proto) + set(phi_enforce_deps ${phi_enforce_deps} external_error_proto) endif() -cc_library(pten_enforce INTERFACE SRCS enforce.cc DEPS 
${pten_enforce_deps}) +cc_library(phi_enforce INTERFACE SRCS enforce.cc DEPS ${phi_enforce_deps}) -cc_library(kernel_factory SRCS kernel_factory.cc DEPS pten_enforce fluid_convert_utils) -cc_library(kernel_context SRCS kernel_context.cc DEPS pten_enforce pten_context) +cc_library(kernel_factory SRCS kernel_factory.cc DEPS phi_enforce fluid_convert_utils) +cc_library(kernel_context SRCS kernel_context.cc DEPS phi_enforce phi_context) -cc_library(ddim SRCS ddim.cc DEPS pten_enforce) -cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS pten_enforce) -cc_library(tensor_meta SRCS tensor_meta.cc DEPS pten_enforce) -cc_library(lod_utils SRCS lod_utils.cc DEPS pten_enforce) +cc_library(ddim SRCS ddim.cc DEPS phi_enforce) +cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce) +cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce) +cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce) -cc_library(pten_device_context SRCS device_context.cc DEPS tensor_base) +cc_library(phi_device_context SRCS device_context.cc DEPS tensor_base) cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base) cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base) cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base) cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor pten_enforce ddim memcpy) +cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) -cc_library(pten_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) +cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) # Will remove once we implemented MKLDNN_Tensor if(WITH_MKLDNN) diff --git a/paddle/phi/core/compat/CMakeLists.txt b/paddle/phi/core/compat/CMakeLists.txt index c6bc9e15a535b..3423e380970df 100644 --- a/paddle/phi/core/compat/CMakeLists.txt +++ b/paddle/phi/core/compat/CMakeLists.txt @@ -1,14 +1,14 @@ -cc_library(arg_map_context SRCS arg_map_context.cc DEPS pten_enforce) +cc_library(arg_map_context SRCS arg_map_context.cc DEPS phi_enforce) cc_library(op_utils SRCS op_utils.cc DEPS arg_map_context enforce) set(convert_utils_deps data_type place op_utils) if(WITH_GPU) - set(convert_utils_deps ${convert_utils_deps} pten_gpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_gpu_info) elseif(WITH_ROCM) - set(convert_utils_deps ${convert_utils_deps} pten_gpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_gpu_info) elseif(WITH_XPU) - set(convert_utils_deps ${convert_utils_deps} pten_xpu_info) + set(convert_utils_deps ${convert_utils_deps} phi_xpu_info) endif() if(WITH_CUSTOM_DEVICE) set(convert_utils_deps ${convert_utils_deps} device_manager) diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index ef085e71f5dcc..4a79f191c23b3 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -3,22 +3,22 @@ set(kernel_declare_file_final ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declaratio file(WRITE ${kernel_declare_file} "// Generated by the paddle/phi/kernels/CMakeLists.txt. 
DO NOT EDIT!\n\n#pragma once\n\n") file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h\"\n\n") -# pten functors and functions called by kernels +# phi functors and functions called by kernels add_subdirectory(funcs) -# pten depends all pten kernel targets -set_property(GLOBAL PROPERTY PTEN_KERNELS "") +# phi depends all phi kernel targets +set_property(GLOBAL PROPERTY PHI_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor) # remove this dep after removing fluid deps on tensor creation -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} pten_api_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) # auto build kernel targets by cmake register_kernels(DEPS ${COMMON_KERNEL_DEPS}) -# pten sparse kernels +# phi sparse kernels add_subdirectory(sparse) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) diff --git a/paddle/phi/tests/CMakeLists.txt b/paddle/phi/tests/CMakeLists.txt index ab5da613199be..3bc13e55eb8a2 100644 --- a/paddle/phi/tests/CMakeLists.txt +++ b/paddle/phi/tests/CMakeLists.txt @@ -2,4 +2,4 @@ add_subdirectory(api) add_subdirectory(common) add_subdirectory(core) add_subdirectory(kernels) -add_subdirectory(ops_signature) +add_subdirectory(ops) diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index d875dbd4444ae..ba3fe8d57b31b 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -1,27 +1,27 @@ if(WITH_ROCM) - hip_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api manual_api glog) + hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api manual_api glog) else() - cc_test(test_pten_tensor SRCS test_pten_tensor.cc DEPS pten_tensor pten_function_api manual_api glog) + cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api manual_api glog) endif() -cc_test(test_pten_exception SRCS test_pten_exception.cc DEPS gtest) +cc_test(test_phi_exception SRCS test_pten_exception.cc DEPS gtest) -cc_test(test_mean_api SRCS test_mean_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_dot_api SRCS test_dot_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_empty_api SRCS test_empty_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_fill_api SRCS test_fill_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_cast_api SRCS test_cast_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_to_api SRCS test_to_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_slice_api SRCS test_slice_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_sum_api SRCS test_sum_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_scale_api SRCS test_scale_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS pten_tensor pten_api 
pten_api_utils) -cc_test(test_conj_api SRCS test_conj_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_concat_api SRCS test_concat_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_split_api SRCS test_split_api.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_data_transform SRCS test_data_transform.cc DEPS pten_tensor pten_api pten_api_utils) -cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS pten_tensor pten_api pten_api_utils) +cc_test(test_mean_api SRCS test_mean_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_dot_api SRCS test_dot_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_empty_api SRCS test_empty_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_fill_api SRCS test_fill_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_elementwise_api SRCS test_elementwise_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_cast_api SRCS test_cast_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_reshape_api SRCS test_reshape_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_to_api SRCS test_to_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_slice_api SRCS test_slice_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_sum_api SRCS test_sum_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_scale_api SRCS test_scale_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_scale_benchmark SRCS test_scale_benchmark.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_conj_api SRCS test_conj_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_concat_api SRCS test_concat_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_split_api SRCS test_split_api.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_data_transform SRCS test_data_transform.cc DEPS phi_tensor phi_api phi_api_utils) +cc_test(test_sparse_utils_api SRCS test_sparse_utils_api.cc DEPS phi_tensor phi_api phi_api_utils) diff --git a/paddle/phi/tests/common/CMakeLists.txt b/paddle/phi/tests/common/CMakeLists.txt index f54b37cb976c5..710ea3c066472 100644 --- a/paddle/phi/tests/common/CMakeLists.txt +++ b/paddle/phi/tests/common/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(pten_test_backend SRCS test_backend.cc DEPS gtest) -cc_test(pten_test_data_layout SRCS test_data_layout.cc DEPS gtest) -cc_test(pten_test_data_type SRCS test_data_type.cc DEPS gtest) -cc_test(pten_test_place SRCS test_place.cc DEPS pten_place) +cc_test(phi_test_backend SRCS test_backend.cc DEPS gtest) +cc_test(phi_test_data_layout SRCS test_data_layout.cc DEPS gtest) +cc_test(phi_test_data_type SRCS test_data_type.cc DEPS gtest) +cc_test(phi_test_place SRCS test_place.cc DEPS phi_place) diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index 576ab7ffe6a66..5356bac9fbd80 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS pten_custom_kernel) +cc_test(test_custom_kernel SRCS test_custom_kernel.cc DEPS phi_custom_kernel) cc_test(test_dense_tensor SRCS test_dense_tensor.cc DEPS dense_tensor) cc_test(test_intrusive_ptr SRCS test_intrusive_ptr.cc) cc_test(test_type_info SRCS test_type_info.cc) @@ -6,7 +6,7 @@ cc_test(test_kernel_factory SRCS test_kernel_factory.cc DEPS 
kernel_factory scal cc_test(test_sparse_coo_tensor SRCS test_sparse_coo_tensor.cc DEPS dense_tensor sparse_coo_tensor) cc_test(test_sparse_csr_tensor SRCS test_sparse_csr_tensor.cc DEPS dense_tensor sparse_csr_tensor) cc_test(test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) -cc_test(test_pten_device_context SRCS test_device_context.cc DEPS pten_context cpu_context) +cc_test(test_phi_device_context SRCS test_device_context.cc DEPS phi_context cpu_context) cc_test(test_meta_fn_utils SRCS test_meta_fn_utils.cc DEPS dense_tensor wrapped_infermeta infermeta infermeta_utils) cc_test(test_ddim SRCS test_ddim.cc DEPS ddim) diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index 9682e063471df..35137aa474e93 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -1,18 +1,18 @@ -cc_test(test_copy_dev_api SRCS test_copy_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_dot_dev_api SRCS test_dot_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_creation_dev_api SRCS test_creation_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_matmul_dev_api SRCS test_matmul_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_cast_dev_api SRCS test_cast_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS pten pten_api_utils) -cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS pten pten_api_utils) +cc_test(test_copy_dev_api SRCS test_copy_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_dot_dev_api SRCS test_dot_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_creation_dev_api SRCS test_creation_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_flatten_dev_api SRCS test_flatten_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_matmul_dev_api SRCS test_matmul_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_mean_dev_api SRCS test_mean_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_scale_dev_api SRCS test_scale_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_cast_dev_api SRCS test_cast_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_elementwise_dev_api SRCS test_elementwise_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_reshape_dev_api SRCS test_reshape_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sum_dev_api SRCS test_sum_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_conj_dev_api SRCS test_conj_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_concat_dev_api SRCS test_concat_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_split_dev_api SRCS test_split_dev_api.cc DEPS phi phi_api_utils) +cc_test(test_sparse_utils_dev_api SRCS test_sparse_utils_dev_api.cc DEPS phi phi_api_utils) cc_test(test_math_function SRCS test_math_function.cc DEPS math_function) if(WITH_GPU) diff --git a/paddle/phi/tests/ops_signature/CMakeLists.txt b/paddle/phi/tests/ops/CMakeLists.txt similarity index 100% rename from 
paddle/phi/tests/ops_signature/CMakeLists.txt rename to paddle/phi/tests/ops/CMakeLists.txt diff --git a/paddle/phi/tests/ops_signature/test_op_signature.cc b/paddle/phi/tests/ops/test_op_signature.cc similarity index 98% rename from paddle/phi/tests/ops_signature/test_op_signature.cc rename to paddle/phi/tests/ops/test_op_signature.cc index 203517c75069d..a6c9a27de7dc5 100644 --- a/paddle/phi/tests/ops_signature/test_op_signature.cc +++ b/paddle/phi/tests/ops/test_op_signature.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/tests/ops_signature/test_op_signature.h" +#include "paddle/phi/tests/ops/test_op_signature.h" #include #include diff --git a/paddle/phi/tests/ops_signature/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h similarity index 100% rename from paddle/phi/tests/ops_signature/test_op_signature.h rename to paddle/phi/tests/ops/test_op_signature.h diff --git a/paddle/phi/tools/CMakeLists.txt b/paddle/phi/tools/CMakeLists.txt index bc690d3a9f193..5693a46d97721 100644 --- a/paddle/phi/tools/CMakeLists.txt +++ b/paddle/phi/tools/CMakeLists.txt @@ -1,5 +1,5 @@ add_executable(print_pten_kernels print_pten_kernels.cc) -target_link_libraries(print_pten_kernels pten pten_api_utils) +target_link_libraries(print_pten_kernels phi phi_api_utils) if(WIN32) target_link_libraries(print_pten_kernels shlwapi.lib) endif() From b8cf8ca72b9bab5b8d951ad24d08b67232239fb5 Mon Sep 17 00:00:00 2001 From: WangXi Date: Fri, 25 Feb 2022 09:45:33 +0800 Subject: [PATCH 41/85] fill_constant_batch_size_like op support fp16 (#39907) --- paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc index 353f73cdd6d05..de06aeb01e4dd 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc @@ -18,6 +18,8 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpKernel, ops::FillConstantBatchSizeLikeOpKernel, ops::FillConstantBatchSizeLikeOpKernel Date: Fri, 25 Feb 2022 10:10:14 +0800 Subject: [PATCH 42/85] Fixed Python-C AutoCodeGen issues (#39897) --- .../auto_code_generator/final_state_generator/python_c_gen.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 5a536067dbe49..9329dc5ffc9dd 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -143,7 +143,7 @@ def GeneratePythonCFunction(fwd_api_name, forward_inputs_position_map, fwd_api_name, fwd_api_name, get_eager_tensor_str, parse_attributes_str, GetForwardFunctionName(fwd_api_name), dygraph_function_call_str) - python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}},\n" + python_c_function_reg_str = f"{{\"final_state_{fwd_api_name}\", (PyCFunction)(void(*)(void))eager_final_state_api_{fwd_api_name}, METH_VARARGS | METH_KEYWORDS, \"C++ interface function for {fwd_api_name} in dygraph.\"}}\n" return python_c_function_str, python_c_function_reg_str @@ -197,7 +197,7 @@ def GenerateCoreOpsInfoMap(): """ core_ops_infos_registry = """ - {\"get_final_state_core_ops_args_info\", + ,{\"get_final_state_core_ops_args_info\", (PyCFunction)(void(*)(void))eager_get_final_state_core_ops_args_info, METH_NOARGS, \"C++ interface function for eager_get_final_state_core_ops_args_info.\"}, {\"get_final_state_core_ops_args_type_info\", From 0615815d6fc6f43e16a0cec1c19996680028686a Mon Sep 17 00:00:00 2001 From: niuliling123 <51102941+niuliling123@users.noreply.github.com> Date: Fri, 25 Feb 2022 10:25:29 +0800 Subject: [PATCH 43/85] Fix a bug in IndexKernel data overflow (#39891) --- paddle/fluid/operators/index_impl.cu.h | 27 +++++++++++++------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/index_impl.cu.h b/paddle/fluid/operators/index_impl.cu.h index 3d6a5e0ea88a2..2e3e6569ef5a8 100644 --- a/paddle/fluid/operators/index_impl.cu.h +++ b/paddle/fluid/operators/index_impl.cu.h @@ -31,24 +31,24 @@ namespace operators { namespace kps = phi::kps; template -__global__ void VectorizedIndexKernel(T *out, int numel, int main_offset, +__global__ void VectorizedIndexKernel(T *out, size_t numel, size_t main_offset, Functor func) { - int data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; - int stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; - int args[VecSize]; + size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + size_t args[VecSize]; T result[VecSize]; for (; data_offset < main_offset; data_offset += stride) { - kps::InitWithDataIndex(&args[0], data_offset); - kps::ElementwiseUnary(&result[0], &args[0], - func); + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], + &args[0], func); kps::WriteData(out + data_offset, &result[0], BLOCK_NUM_X * VecSize); } - int num = numel - data_offset; + size_t num = numel - data_offset; if (num > 0) { - kps::InitWithDataIndex(&args[0], data_offset); - 
kps::ElementwiseUnary(&result[0], &args[0], - func); + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary(&result[0], + &args[0], func); kps::WriteData(out + data_offset, &result[0], num); } } @@ -58,7 +58,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int numel = out->numel(); T *out_data = out->mutable_data(dev_ctx.GetPlace()); if (numel <= 0) return; - int vec_size = paddle::platform::GetVectorizedSize((out->data())); + int vec_size = paddle::platform::GetVectorizedSize(out_data); #ifdef PADDLE_WITH_XPU_KP int block = 64; int grid = 8; @@ -70,8 +70,7 @@ void IndexKernel(const KPDevice &dev_ctx, Tensor *out, Functor func) { int block = config.thread_per_block.x; auto stream = dev_ctx.stream(); #endif - - int main_offset = (numel / (vec_size * block)) * vec_size * block; + size_t main_offset = (numel / (vec_size * block)) * vec_size * block; switch (vec_size) { case 4: VectorizedIndexKernel<<>>( From 04d324b299433f0be11e86811678df5bf4c6cf17 Mon Sep 17 00:00:00 2001 From: fwenguang <95677191+fwenguang@users.noreply.github.com> Date: Fri, 25 Feb 2022 10:30:22 +0800 Subject: [PATCH 44/85] [MLU] add elementwise_mul mlu kernel (#39864) --- .../elementwise/elementwise_mul_op_mlu.cc | 169 ++++++++++++ .../mlu/test_elementwise_mul_op_mlu.py | 240 ++++++++++++++++++ 2 files changed, 409 insertions(+) create mode 100644 paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc new file mode 100644 index 0000000000000..a7505890f41d4 --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_mlu.cc @@ -0,0 +1,169 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using MLUDeviceContext = platform::MLUDeviceContext; + +static void GetReduceAxes(const int axis, const framework::DDim& src_ddims, + const framework::DDim& target_ddims, + std::vector* axes) { + int64_t src_dim_size = src_ddims.size(); + int64_t target_dim_size = target_ddims.size(); + for (int64_t i = 0; i < src_dim_size; ++i) { + if (i < axis || i >= target_dim_size + axis) { + axes->push_back(i); + continue; + } + if (src_ddims[i] > target_ddims[i - axis]) { + axes->push_back(i); + } + } +} + +template +class ElementwiseMulMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = (axis < 0 ? 
(std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnlOpTensorDesc op_tensor_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, op_tensor_desc.get(), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(y), out_desc.get(), + GetBasePtr(out), ToCnnlDataType()); + } +}; + +template +class ElementwiseMulGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + const auto& x_dims = x->dims(); + const auto& y_dims = y->dims(); + axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + int max_dim = std::max(x_dims.size(), y_dims.size()); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + + MLUCnnlTensorDesc x_desc(max_dim, x_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc y_desc(max_dim, y_dims_array.data(), ToCnnlDataType()); + MLUCnnlTensorDesc dout_desc(*dout); + MLUCnnlOpTensorDesc mul_op_desc(CNNL_OP_TENSOR_MUL, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + if (dx->dims() == dout->dims()) { + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), y_desc.get(), GetBasePtr(y), + x_desc.get(), GetBasePtr(dx), ToCnnlDataType()); + } else { + Tensor dx_temp(x->dtype()); + dx_temp.Resize(dout->dims()); + dx_temp.mutable_data(ctx.GetPlace()); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), y_desc.get(), GetBasePtr(y), + dout_desc.get(), GetBasePtr(&dx_temp), + ToCnnlDataType()); + + std::vector reduce_axes; + GetReduceAxes(axis, dx_temp.dims(), dx->dims(), &reduce_axes); + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dx_desc(*dx); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(&dx_temp), 0, + nullptr, nullptr, dx_desc.get(), GetBasePtr(dx)); + } + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + if (dy->dims() == dout->dims()) { + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), x_desc.get(), GetBasePtr(x), + y_desc.get(), GetBasePtr(dy), ToCnnlDataType()); + } else { + Tensor dy_temp(y->dtype()); + dy_temp.Resize(dout->dims()); + dy_temp.mutable_data(ctx.GetPlace()); + MLUCnnl::OpTensor(ctx, mul_op_desc.get(), dout_desc.get(), + GetBasePtr(dout), x_desc.get(), GetBasePtr(x), + dout_desc.get(), GetBasePtr(&dy_temp), + ToCnnlDataType()); + + std::vector reduce_axes; + GetReduceAxes(axis, 
dy_temp.dims(), dy->dims(), &reduce_axes); + MLUCnnlReduceDesc reduction_desc( + reduce_axes, CNNL_REDUCE_ADD, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + MLUCnnlTensorDesc dy_desc(*dy); + MLUCnnl::Reduce(ctx, true /*need_workspace*/, reduction_desc.get(), + nullptr, dout_desc.get(), GetBasePtr(&dy_temp), 0, + nullptr, nullptr, dy_desc.get(), GetBasePtr(dy)); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_MLU_KERNEL(elementwise_mul, ops::ElementwiseMulMLUKernel, + ops::ElementwiseMulMLUKernel, + ops::ElementwiseMulMLUKernel); + +REGISTER_OP_MLU_KERNEL( + elementwise_mul_grad, ops::ElementwiseMulGradMLUKernel, + ops::ElementwiseMulGradMLUKernel, + ops::ElementwiseMulGradMLUKernel); diff --git a/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py new file mode 100644 index 0000000000000..bc8a08c39ffc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_elementwise_mul_op_mlu.py @@ -0,0 +1,240 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid import Program, compiler, program_guard +from paddle.fluid.op import Operator + +import sys +sys.path.append('..') +from op_test import OpTest, skip_check_grad_ci + +paddle.enable_static() + + +class ElementwiseMulOp(OpTest): + def init_kernel_type(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def setUp(self): + self.op_type = "elementwise_mul" + self.dtype = np.float32 + self.axis = -1 + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) + } + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis} + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('Y')) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) + + def init_dtype(self): + pass + + def init_axis(self): + pass + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMulOp_scalar(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': 
np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_Vector(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x * self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1) + } + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100) + } + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1) + } + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 2, 11).astype(np.float32), + 'Y': np.random.rand(10, 1, 11).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 4, 2, 3).astype(np.float32), + 'Y': np.random.rand(10, 4, 1, 3).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOpFp16(ElementwiseMulOp): + def init_dtype(self): + self.dtype = np.float16 + + +class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(30, 3, 1, 5).astype(np.float32), + 'Y': np.random.rand(30, 1, 4, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + self.init_kernel_type() + + +class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): + def setUp(self): + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 
10).astype(np.float32), + 'Y': np.random.rand(2, 2, 10, 10).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y'] + } + self.init_kernel_type() + + +class TestElementwiseMulOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of elementwise_mul must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CPUPlace()) + self.assertRaises(TypeError, fluid.layers.elementwise_mul, x1, y1) + + # the input dtype of elementwise_mul must be float16 or float32 or int32 + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.elementwise_mul, x2, y2) + + +if __name__ == '__main__': + unittest.main() From ef96ffb6bc9930cc48b37c29e688f07c0cab5a3a Mon Sep 17 00:00:00 2001 From: Li Min <11663212+limin2021@users.noreply.github.com> Date: Fri, 25 Feb 2022 10:31:36 +0800 Subject: [PATCH 45/85] [Fix bug] fix fp16 atomicAdd compiler error on different cuda_arch. (#39886) * Fix compile error on cuda_arch less than 700. --- paddle/fluid/platform/device/gpu/gpu_primitives.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 8616e969f69df..8aec8e840f332 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -210,6 +210,12 @@ template ::value>::type * = nullptr> __device__ __forceinline__ void VectorizedAtomicAddPerBlock( const int64_t len, int tid, int threads_per_block, const T *in, T *out) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +#else int i = 0; int loops = len / 2 * 2; @@ -233,6 +239,7 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( fastAtomicAdd(out, i, len, in[i]); } } +#endif } #endif #endif From fed6de40475ea796a195c9471f86b193ec62c11c Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Fri, 25 Feb 2022 10:31:54 +0800 Subject: [PATCH 46/85] [Bug Fixes]Fix Bugs when construct infermeta by using shape(Vector) (#39904) * fix bugs * fix bugs --- paddle/fluid/framework/infershape_utils.cc | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 4bec1baeaaee9..0900ed2ff2f5d 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -348,17 +348,30 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 1; + int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); for (size_t i = 0; i < infershape_inputs.size(); i++) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); + + if (vars.size() == 1) { + num_ele = 1; + const auto& tensor_dims = vars[0]->GetShape(); for (size_t i = 0; i < tensor_dims.size(); ++i) { num_ele *= tensor_dims[i]; } + } else { + for (auto& var : vars) { + const auto& 
tensor_dims = var->GetShape(); + PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, + platform::errors::InvalidArgument( + "The shape is constructed by multi-tensor, " + "every tensor's dims should be 1. But your " + "shape has tensor that dims is %s.", + tensor_dims.size())); + num_ele += tensor_dims[0]; + } } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); From 8895379a0a9d1223480071d97befe71876272623 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 25 Feb 2022 10:32:27 +0800 Subject: [PATCH 47/85] [Phi] Support cudnn kernel moving & move softmax kernels (#39547) * support cudnn kernel moving * polish cmake rules * add unittest for coverage * remove orig kernel * remove softmax cudnn kernel * fix softmax test failed * fix npu func error * resolve conflict * rename gpu dnn kernels * fix name rule error * fix compile error * update fp16 namespace --- cmake/phi.cmake | 89 ++-- .../ir/mkldnn/mkldnn_inplace_pass_tester.cc | 2 +- paddle/fluid/framework/pten_utils.cc | 4 +- paddle/fluid/framework/pten_utils_test.cc | 37 +- .../inference/tensorrt/convert/softmax_op.cc | 2 +- .../tensorrt/convert/test_softmax_op.cc | 2 +- .../c_softmax_with_cross_entropy_op.cu | 9 +- .../c_softmax_with_cross_entropy_op.h | 1 - paddle/fluid/operators/fused/fmha_ref.h | 22 +- .../operators/margin_cross_entropy_op.cu | 9 +- .../fluid/operators/margin_cross_entropy_op.h | 1 - paddle/fluid/operators/math/softmax.cc | 8 + paddle/fluid/operators/math/softmax.cu | 11 + .../operators/mkldnn/softmax_mkldnn_op.cc | 9 +- .../operators/mkldnn/test_mkldnn_caching.cc | 2 +- .../mkldnn/test_mkldnn_op_inplace.cc | 2 +- paddle/fluid/operators/softmax_cudnn_op.cu | 72 --- paddle/fluid/operators/softmax_op.cc | 10 +- paddle/fluid/operators/softmax_op.cu.cc | 27 -- paddle/fluid/operators/softmax_op.h | 114 ----- paddle/fluid/operators/softmax_op_npu.cc | 5 +- paddle/fluid/operators/softmax_op_npu_test.cc | 2 +- paddle/fluid/operators/softmax_op_xpu.cc | 6 +- .../softmax_with_cross_entropy_op.cc | 4 +- .../softmax_with_cross_entropy_op.cu | 33 +- .../operators/softmax_with_cross_entropy_op.h | 21 +- .../softmax_with_cross_entropy_op_npu.cc | 22 +- .../softmax_with_cross_entropy_op_xpu.cc | 12 +- .../test_common_infer_shape_functions.cc | 2 +- paddle/phi/backends/gpu/gpu_context.h | 7 + paddle/phi/common/backend.h | 10 +- paddle/phi/common/float16.h | 12 + paddle/phi/core/compat/convert_utils.cc | 2 +- paddle/phi/kernels/CMakeLists.txt | 11 +- paddle/phi/kernels/cpu/softmax_grad_kernel.cc | 22 + paddle/phi/kernels/cpu/softmax_kernel.cc | 22 + paddle/phi/kernels/funcs/axis_utils.h | 54 +++ paddle/phi/kernels/funcs/concat_funcs.h | 2 +- paddle/phi/kernels/funcs/eigen/elementwise.cu | 2 +- paddle/phi/kernels/gpu/softmax_grad_kernel.cu | 28 ++ paddle/phi/kernels/gpu/softmax_kernel.cu | 28 ++ .../kernels/gpudnn/softmax_gpudnn.h} | 444 ++++++++++++------ .../gpudnn/softmax_grad_kernel_gpudnn.cu | 50 ++ .../kernels/gpudnn/softmax_kernel_gpudnn.cu | 49 ++ .../kernels/impl/softmax_grad_kernel_impl.h | 51 ++ paddle/phi/kernels/impl/softmax_kernel_impl.h | 48 ++ paddle/phi/kernels/softmax_grad_kernel.h | 29 ++ paddle/phi/kernels/softmax_kernel.h | 38 ++ paddle/phi/ops/compat/softmax_sig.cc | 34 ++ paddle/phi/tests/common/test_backend.cc | 6 +- 50 files changed, 996 insertions(+), 493 deletions(-) delete mode 100644 paddle/fluid/operators/softmax_cudnn_op.cu delete mode 100644 paddle/fluid/operators/softmax_op.cu.cc delete mode 100644 paddle/fluid/operators/softmax_op.h create mode 100644 
paddle/phi/kernels/cpu/softmax_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/softmax_kernel.cc create mode 100644 paddle/phi/kernels/funcs/axis_utils.h create mode 100644 paddle/phi/kernels/gpu/softmax_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/softmax_kernel.cu rename paddle/{fluid/operators/softmax_cudnn_op.cu.h => phi/kernels/gpudnn/softmax_gpudnn.h} (63%) create mode 100644 paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu create mode 100644 paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu create mode 100644 paddle/phi/kernels/impl/softmax_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/softmax_kernel_impl.h create mode 100644 paddle/phi/kernels/softmax_grad_kernel.h create mode 100644 paddle/phi/kernels/softmax_kernel.h create mode 100644 paddle/phi/ops/compat/softmax_sig.cc diff --git a/cmake/phi.cmake b/cmake/phi.cmake index f1a6f8e45a74c..d9132b84455e7 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -81,6 +81,8 @@ function(kernel_declare TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./xpu\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./gpudnn\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -94,6 +96,7 @@ function(kernel_library TARGET) set(cpu_srcs) set(gpu_srcs) set(xpu_srcs) + set(gpudnn_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -101,6 +104,8 @@ function(kernel_library TARGET) set(oneValueArgs SUB_DIR) set(multiValueArgs SRCS DEPS) + set(target_build_flag 1) + cmake_parse_arguments(kernel_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -123,6 +128,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) + endif() endif() if (WITH_XPU) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) @@ -141,6 +149,7 @@ function(kernel_library TARGET) list(APPEND all_srcs ${cpu_srcs}) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) + list(APPEND all_srcs ${gpudnn_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) @@ -166,21 +175,22 @@ function(kernel_library TARGET) list(LENGTH cpu_srcs cpu_srcs_len) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) + list(LENGTH gpudnn_srcs gpudnn_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR - ${selected_rows_srcs_len} GREATER 0)) + ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND + (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. 
if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) nv_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() else() @@ -190,14 +200,14 @@ function(kernel_library TARGET) endif() endif() # If there are only specific device srcs, build target using this rule. - elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) if (WITH_GPU) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() elseif (WITH_ROCM) - if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0) - hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) @@ -234,35 +244,40 @@ function(kernel_library TARGET) cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() else() - message(FATAL_ERROR "Cannot find any implementation for ${TARGET}") + set(target_build_flag 0) endif() - if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR - ${selected_rows_srcs_len} GREATER 0) - # append target into PHI_KERNELS property - get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) - set(phi_kernels ${phi_kernels} ${TARGET}) - set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels}) - endif() + if (${target_build_flag} EQUAL 1) + if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) + # append target into PHI_KERNELS property + get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) + set(phi_kernels ${phi_kernels} ${TARGET}) + set_property(GLOBAL PROPERTY PHI_KERNELS ${phi_kernels}) + endif() - # parse kernel name and auto generate kernel declaration - # here, we don't need to check WITH_XXX, because if not WITH_XXX, the - # xxx_srcs_len will be equal to 0 - if (${common_srcs_len} GREATER 0) - kernel_declare(${common_srcs}) - endif() - if (${cpu_srcs_len} GREATER 0) - 
kernel_declare(${cpu_srcs}) - endif() - if (${gpu_srcs_len} GREATER 0) - kernel_declare(${gpu_srcs}) - endif() - if (${xpu_srcs_len} GREATER 0) - kernel_declare(${xpu_srcs}) - endif() - if (${selected_rows_srcs_len} GREATER 0) - kernel_declare(${selected_rows_srcs}) + # parse kernel name and auto generate kernel declaration + # here, we don't need to check WITH_XXX, because if not WITH_XXX, the + # xxx_srcs_len will be equal to 0 + if (${common_srcs_len} GREATER 0) + kernel_declare(${common_srcs}) + endif() + if (${cpu_srcs_len} GREATER 0) + kernel_declare(${cpu_srcs}) + endif() + if (${gpu_srcs_len} GREATER 0) + kernel_declare(${gpu_srcs}) + endif() + if (${xpu_srcs_len} GREATER 0) + kernel_declare(${xpu_srcs}) + endif() + if (${gpudnn_srcs_len} GREATER 0) + kernel_declare(${gpudnn_srcs}) + endif() + if (${selected_rows_srcs_len} GREATER 0) + kernel_declare(${selected_rows_srcs}) + endif() endif() endfunction() diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc index ea335e9bd63c6..0a95444f852dd 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_inplace_pass_tester.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/pass_tester_helper.h" #include "paddle/fluid/framework/op_registry.h" -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/pten_utils.cc index 0ecc04dbd6b8d..af9d62ff7a845 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/pten_utils.cc @@ -67,7 +67,7 @@ OpKernelType TransPtenKernelKeyToOpKernelType( LibraryType library_type = LibraryType::kPlain; if (kernel_key.backend() == phi::Backend::MKLDNN) { library_type = LibraryType::kMKLDNN; - } else if (kernel_key.backend() == phi::Backend::CUDNN) { + } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; } else { // do nothing @@ -82,7 +82,7 @@ phi::KernelKey TransOpKernelTypeToPtenKernelKey( if (kernel_type.library_type_ == LibraryType::kMKLDNN) { backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { - backend = phi::Backend::CUDNN; + backend = phi::Backend::GPUDNN; } else { // do } diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/pten_utils_test.cc index 3c86372e6e752..da1431c0efafe 100644 --- a/paddle/fluid/framework/pten_utils_test.cc +++ b/paddle/fluid/framework/pten_utils_test.cc @@ -42,7 +42,7 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { #endif #ifdef PADDLE_WITH_CUDA - phi::KernelKey kernel_key_cudnn(phi::Backend::CUDNN, phi::DataLayout::NCHW, + phi::KernelKey kernel_key_cudnn(phi::Backend::GPUDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_cudnn); @@ -53,3 +53,38 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { paddle::framework::LibraryType::kCUDNN); #endif } + +TEST(PtenUtils, TransOpKernelTypeToPtenKernelKey) { + paddle::framework::OpKernelType op_kernel_type( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kNCHW); + auto kernel_key = + paddle::framework::TransOpKernelTypeToPtenKernelKey(op_kernel_type); + ASSERT_EQ(kernel_key.dtype(), phi::DataType::FLOAT32); + 
ASSERT_EQ(kernel_key.layout(), phi::DataLayout::NCHW); + ASSERT_EQ(kernel_key.backend(), phi::Backend::CPU); + +#ifdef PADDLE_WITH_MKLDNN + paddle::framework::OpKernelType op_kernel_type_mkldnn( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kMKLDNN, + paddle::framework::LibraryType::kMKLDNN); + auto kernel_key_mkldnn = paddle::framework::TransOpKernelTypeToPtenKernelKey( + op_kernel_type_mkldnn); + ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN); + ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::MKLDNN); +#endif + +#ifdef PADDLE_WITH_CUDA + paddle::framework::OpKernelType op_kernel_type_cudnn( + paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), + paddle::framework::DataLayout::kNCHW, + paddle::framework::LibraryType::kCUDNN); + auto kernel_key_cudnn = + paddle::framework::TransOpKernelTypeToPtenKernelKey(op_kernel_type_cudnn); + ASSERT_EQ(kernel_key_cudnn.dtype(), phi::DataType::FLOAT32); + ASSERT_EQ(kernel_key_cudnn.layout(), phi::DataLayout::NCHW); + ASSERT_EQ(kernel_key_cudnn.backend(), phi::Backend::GPUDNN); +#endif +} diff --git a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc index 9cefb24751e18..46e6c18bfb8e3 100644 --- a/paddle/fluid/inference/tensorrt/convert/softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/softmax_op.cc @@ -88,5 +88,5 @@ class SoftMaxOpConverter : public OpConverter { } // namespace inference } // namespace paddle -USE_OP(softmax); +USE_OP_ITSELF(softmax); REGISTER_TRT_OP_CONVERTER(softmax, SoftMaxOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc index b6fdcddf309d8..9cd5e81141598 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_softmax_op.cc @@ -45,4 +45,4 @@ TEST(SoftMaxOpConverter, main) { } // namespace inference } // namespace paddle -USE_OP(softmax); +USE_OP_ITSELF(softmax); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index 4f1f1ec651206..b5beb770909b5 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -98,8 +99,8 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel { const auto& labels_dims = labels->dims(); const int axis = logits_dims.size() - 1; - const int N = SizeToAxis(axis, logits_dims); - const int D = SizeFromAxis(axis, logits_dims); + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); Tensor logits_2d, softmax_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({N, D}); @@ -220,8 +221,8 @@ class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } const auto sofrmax_dims = softmax->dims(); const int axis = sofrmax_dims.size() - 1; - const int N = SizeToAxis(axis, sofrmax_dims); - const int D = SizeFromAxis(axis, sofrmax_dims); + const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); Tensor logit_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({N, D}); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h index c7cfd41fa2556..f5399e3215d58 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h @@ -23,7 +23,6 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fmha_ref.h b/paddle/fluid/operators/fused/fmha_ref.h index 31fff4b668d54..0202776757973 100644 --- a/paddle/fluid/operators/fused/fmha_ref.h +++ b/paddle/fluid/operators/fused/fmha_ref.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dropout_impl.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/transpose_op.cu.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { @@ -123,11 +123,11 @@ class FMHARef { T, T>( dev_ctx_, ins, &outs, elewise_add_axis, AddFunctor()); - SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, - softmax_axis, softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *src_mask_out_tensor, + softmax_axis, softmax_out_tensor); } else { - SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, softmax_axis, - softmax_out_tensor); + phi::SoftmaxForwardCUDAKernelDriver(dev_ctx_, *qk_out_tensor, + softmax_axis, softmax_out_tensor); } transB = CblasNoTrans; @@ -251,9 +251,9 @@ class FMHARef { } if (src_mask_tensor != nullptr) { - SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, - *softmax_out_grad_tensor, softmax_axis, - src_mask_out_grad_tensor); + phi::SoftmaxBackwardCUDAKernelDriver( + dev_ctx_, softmax_out_tensor, *softmax_out_grad_tensor, softmax_axis, + src_mask_out_grad_tensor); // recall LaunchElementwiseCudaKernel fw: src_mask_out = qk_out + // src_mask @@ -272,9 +272,9 @@ class FMHARef { } } else { - SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, - *softmax_out_grad_tensor, softmax_axis, - qk_out_grad_tensor); + phi::SoftmaxBackwardCUDAKernelDriver(dev_ctx_, softmax_out_tensor, + *softmax_out_grad_tensor, + softmax_axis, qk_out_grad_tensor); } T* qk_out_grad_data = qk_out_grad_tensor->data(); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index c6405f65ee3dd..a2e34d98461e0 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -26,6 +26,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -246,8 +247,8 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { const auto& labels_dims = labels->dims(); const int axis = logits_dims.size() - 1; - const int N = SizeToAxis(axis, logits_dims); - const int D = SizeFromAxis(axis, logits_dims); + const int N = phi::funcs::SizeToAxis(axis, logits_dims); + const int D = phi::funcs::SizeFromAxis(axis, logits_dims); int blocks = NumBlocks(N); int threads = kNumCUDAThreads; @@ -401,8 +402,8 @@ class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { const auto sofrmax_dims = softmax->dims(); const int axis = sofrmax_dims.size() - 1; - const int N = SizeToAxis(axis, sofrmax_dims); - const int D = SizeFromAxis(axis, sofrmax_dims); + const int N = phi::funcs::SizeToAxis(axis, sofrmax_dims); + const int D = phi::funcs::SizeFromAxis(axis, sofrmax_dims); if (return_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), diff --git a/paddle/fluid/operators/margin_cross_entropy_op.h b/paddle/fluid/operators/margin_cross_entropy_op.h index fe0dab5d47d35..9261c84c8552c 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.h +++ b/paddle/fluid/operators/margin_cross_entropy_op.h @@ -22,7 
+22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index fa2018178f44f..c855cb763a97b 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/phi/backends/cpu/cpu_context.h" namespace paddle { namespace operators { @@ -26,6 +27,13 @@ template class SoftmaxFunctor; template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index 692a077f1050f..fd879e9e6ffe7 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -139,6 +140,16 @@ template class SoftmaxGradFunctor; template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; + } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 2effcbf9f46dd..a0e50aa297851 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -12,8 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -70,7 +71,8 @@ class SoftmaxMKLDNNHandler out_grad->dims(), in_x_grad->dims())); auto dims = out_grad->dims(); // input and output share the same shape - const int axis = CanonicalAxis(ctx.Attr("axis"), dims.size()); + const int axis = + phi::funcs::CanonicalAxis(ctx.Attr("axis"), dims.size()); auto softmax_tz = phi::vectorize(dims); auto data_softmax_md = MKLDNNMemDesc( @@ -96,7 +98,8 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { Tensor* output = ctx.Output("Out"); bool is_inplaced = input->IsSharedBufferWith(*output); - const int axis = CanonicalAxis(ctx.Attr("axis"), input->dims().size()); + const int axis = + phi::funcs::CanonicalAxis(ctx.Attr("axis"), input->dims().size()); SoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), input, output, axis); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index 9c5bad86278ed..2fdeecf89346f 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -31,7 +31,7 @@ USE_OP(elementwise_mul); USE_OP_DEVICE_KERNEL(elementwise_mul, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); USE_OP(conv2d); USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc index 92c58ae0a7767..c776cf2a7c792 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_inplace.cc @@ -29,7 +29,7 @@ USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP(relu); USE_OP_DEVICE_KERNEL(relu, MKLDNN); -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, MKLDNN); namespace paddle { diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu b/paddle/fluid/operators/softmax_cudnn_op.cu deleted file mode 100644 index 72c2e97c1782e..0000000000000 --- a/paddle/fluid/operators/softmax_cudnn_op.cu +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxCUDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.template device_context(); - SoftmaxForwardCUDAKernelDriver(dev_ctx, *x, input_axis, out); - } -}; - -template -class SoftmaxGradCUDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* out = ctx.Input("Out"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int input_axis = ctx.Attr("axis"); - auto& dev_ctx = ctx.template device_context(); - SoftmaxBackwardCUDAKernelDriver(dev_ctx, *out, *dout, input_axis, dx); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel); -#else -REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel, - ops::SoftmaxCUDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel, - ops::SoftmaxGradCUDNNKernel); -#endif diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cb97a0bb27cb5..374992096605b 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -12,12 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/softmax_op.h" - #include #include #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #ifdef PADDLE_WITH_MKLDNN @@ -251,10 +250,3 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpGradMaker, ops::SoftmaxInplaceInferer); REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL( - softmax, ops::SoftmaxKernel, - ops::SoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - softmax_grad, - ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cu.cc b/paddle/fluid/operators/softmax_op.cu.cc deleted file mode 100644 index 19359b7eef512..0000000000000 --- a/paddle/fluid/operators/softmax_op.cu.cc +++ /dev/null @@ -1,27 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/softmax_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - softmax, ops::SoftmaxKernel, - ops::SoftmaxKernel, - ops::SoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - softmax_grad, ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel, - ops::SoftmaxGradKernel); diff --git a/paddle/fluid/operators/softmax_op.h b/paddle/fluid/operators/softmax_op.h deleted file mode 100644 index 497bbb06dab5f..0000000000000 --- a/paddle/fluid/operators/softmax_op.h +++ /dev/null @@ -1,114 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline int SizeToAxis(const int axis, DDim dims) { - int size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeFromAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeOutAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -class SoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = X->dims()[axis]; - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - if (Out->numel() == 0) { - return; - } - - const int n = SizeToAxis(axis, X->dims()); - const int d = SizeFromAxis(axis, X->dims()); - Tensor X_2d, Out_2d; - X_2d.ShareDataWith(*X).Resize({n, d}); - Out_2d.ShareDataWith(*Out).Resize({n, d}); - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_2d, - &Out_2d); - } -}; - -template -class SoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = dX->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = dX->dims()[axis]; - - // allocate memory on device. 
- dX->mutable_data(context.GetPlace()); - if (dX->numel() == 0) { - return; - } - - const int n = SizeToAxis(axis, dX->dims()); - const int d = SizeFromAxis(axis, dX->dims()); - Tensor dX_2d, Out_2d, dOut_2d; - dX_2d.ShareDataWith(*dX).Resize({n, d}); - Out_2d.ShareDataWith(*Out).Resize({n, d}); - dOut_2d.ShareDataWith(*dOut).Resize({n, d}); - - math::SoftmaxGradFunctor()( - context.template device_context(), axis_dim, &Out_2d, - &dOut_2d, &dX_2d); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/softmax_op_npu.cc b/paddle/fluid/operators/softmax_op_npu.cc index 07e74354bfd7c..152c8d0a883b0 100644 --- a/paddle/fluid/operators/softmax_op_npu.cc +++ b/paddle/fluid/operators/softmax_op_npu.cc @@ -12,8 +12,9 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -51,7 +52,7 @@ class SoftmaxGradNPUKernel : public framework::OpKernel { auto dims = dX->dims(); const int rank = dims.size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); int64_t first_dim = 1; int64_t sec_dim = 1; for (int i = 0; i < axis; i++) { diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index defda1a3b04a6..3bc55fafd81e1 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -29,7 +29,7 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -USE_OP(softmax); +USE_OP_ITSELF(softmax); USE_OP_DEVICE_KERNEL(softmax, NPU); template diff --git a/paddle/fluid/operators/softmax_op_xpu.cc b/paddle/fluid/operators/softmax_op_xpu.cc index a29804e505f66..1ed13c8bd1bae 100644 --- a/paddle/fluid/operators/softmax_op_xpu.cc +++ b/paddle/fluid/operators/softmax_op_xpu.cc @@ -11,8 +11,8 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -29,7 +29,7 @@ class SoftmaxXPUKernel : public framework::OpKernel { auto* x = context.Input("X"); auto* out = context.Output("Out"); const int rank = x->dims().size(); - int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); // allocate memory on device. out->mutable_data(context.GetPlace()); @@ -88,7 +88,7 @@ class SoftmaxGradXPUKernel : public framework::OpKernel { auto* dout = context.Input(framework::GradVarName("Out")); auto* dx = context.Output(framework::GradVarName("X")); const int rank = dx->dims().size(); - int axis = CanonicalAxis(context.Attr("axis"), rank); + int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); // allocate memory on device. 
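// [Editorial aside, not part of the patch] This patch repeatedly swaps the old
// file-local CanonicalAxis/SizeToAxis/SizeFromAxis helpers for the shared
// phi::funcs versions defined in the new paddle/phi/kernels/funcs/axis_utils.h
// added later in this patch. A minimal worked sketch of what those helpers
// compute follows; the function name and the example shape are illustrative
// only and do not appear in the patch itself.
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"

void AxisUtilsExample() {
  phi::DDim dims = phi::make_ddim({2, 3, 4, 5});
  // Negative axes wrap around: -2 + rank(4) == 2.
  int axis = phi::funcs::CanonicalAxis(-2, dims.size());
  int n = phi::funcs::SizeToAxis(axis, dims);    // 2 * 3 == 6  (dims before axis)
  int d = phi::funcs::SizeFromAxis(axis, dims);  // 4 * 5 == 20 (dims from axis on)
  int out = phi::funcs::SizeOutAxis(axis, dims); // 5           (dims after axis)
  // A softmax along axis 2 therefore views the input as an {n, d} = {6, 20}
  // matrix, which is exactly the 2-D reshape the softmax kernels in this patch
  // perform before calling the math::Softmax functors.
  (void)n;
  (void)d;
  (void)out;
}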
dx->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index cba779d0a77d0..6f0881e9fc98f 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -153,7 +153,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(Logits).")); - axis = CanonicalAxis(axis, logits_rank); + axis = phi::funcs::CanonicalAxis(axis, logits_rank); for (int i = 0; i < logits_rank; i++) { if (i != axis) { if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { @@ -250,7 +250,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { "Attr(axis) value should be in range [-R, R-1], " "R is the rank of Input(Logits).")); - axis = CanonicalAxis(axis, softmax_rank); + axis = phi::funcs::CanonicalAxis(axis, softmax_rank); for (int i = 0; i < softmax_rank; i++) { if (i != axis) { if (ctx->IsRuntime() || (softmax_dims[i] > 0 && labels_dims[i] > 0)) { diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 2bbacef596e59..fd035df768dbd 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -17,12 +17,12 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" namespace paddle { namespace operators { @@ -236,7 +236,7 @@ __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, max_value[i] = (max_value[i] > valmax) ? 
max_value[i] : valmax; } } - WarpReduceMax(max_value); + phi::WarpReduceMax(max_value); // compute sum: s_{i} = sum_{j}{ exp(src_{i,j} - maxvalue_{i} } AccT sum[kBatchSize]; @@ -276,7 +276,7 @@ __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); // write data #pragma unroll @@ -566,7 +566,7 @@ __global__ void CrossEntropySoftLabel(T* loss, T* softmaxwrt, const T* softmax, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); __syncthreads(); __shared__ T sumshare[kWarpPerBatch][kBatchPerBlock][kBatchSize]; @@ -674,7 +674,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, : static_cast(valmax); } } - WarpReduceMax(max_value); + phi::WarpReduceMax(max_value); // compute sum AccT sum[kBatchSize]{0.0}; @@ -694,7 +694,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, } } } - WarpReduceSum(sum); + phi::WarpReduceSum(sum); // log_softmax and loss AccT sumloss[kBatchSize]{0.0}; @@ -737,7 +737,7 @@ __global__ void WarpSoftmaxForwardSoftLabel(T* loss, T* softmax, const T* src, } // loss - WarpReduceSum(sumloss); + phi::WarpReduceSum(sumloss); for (int i = 0; i < kBatchSize; i++) { if (i >= local_batches) break; @@ -950,11 +950,12 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = softmax->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = + phi::funcs::CanonicalAxis(context.Attr("axis"), rank); const int axis_dim = softmax->dims()[axis]; - const int n = SizeToAxis(axis, softmax->dims()); - const int d = SizeFromAxis(axis, softmax->dims()); + const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); + const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); auto* softmax_out_data = softmax_out->template mutable_data(context.GetPlace()); @@ -1035,11 +1036,11 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logits->dims()[axis]; - const int64_t n = SizeToAxis(axis, logits->dims()); - const int64_t d = SizeFromAxis(axis, logits->dims()); + const int64_t n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis, logits->dims()); auto* softmax_data = softmax->template mutable_data(context.GetPlace()); auto* loss_data = loss->template mutable_data(context.GetPlace()); @@ -1118,11 +1119,11 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { T* logit_grad_data = logit_grad->template data(); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logit_grad->dims()[axis]; - const int64_t n = SizeToAxis(axis, logit_grad->dims()); - const int64_t d = SizeFromAxis(axis, logit_grad->dims()); + const int64_t n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); + const int64_t d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); const int64_t remain = d / axis_dim; #ifdef __HIPCC__ diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index a7f88dd0ec38e..4b875cbf5841f 100644 --- 
a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cross_entropy.h" #include "paddle/fluid/operators/math/softmax.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" namespace paddle { namespace operators { @@ -84,7 +84,8 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { Tensor* softmax_out = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); const int rank = softmax->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = + phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = softmax->dims()[axis]; PADDLE_ENFORCE_GT( @@ -97,7 +98,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax_out->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, softmax->dims()); + const int n = phi::funcs::SizeToAxis(axis, softmax->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( @@ -105,7 +106,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { "SizeToAxis of softmax is %d.", n)); - const int d = SizeFromAxis(axis, softmax->dims()); + const int d = phi::funcs::SizeFromAxis(axis, softmax->dims()); Tensor softmax_2d, labels_2d, loss_2d, softmax_out_2d; softmax_2d.ShareDataWith(*softmax).Resize({n, d}); @@ -133,7 +134,7 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logits->dims()[axis]; PADDLE_ENFORCE_GT( axis_dim, 0, @@ -145,14 +146,14 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, logits->dims()); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( "The size of axis should be larger than 0, but received " "SizeToAxis of logits is %d.", n)); - const int d = SizeFromAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); Tensor logits_2d, softmax_2d, labels_2d, loss_2d; logits_2d.ShareDataWith(*logits).Resize({n, d}); softmax_2d.ShareDataWith(*softmax).Resize({n, d}); @@ -192,7 +193,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { auto ignore_index = context.Attr("ignore_index"); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); int axis_dim = logit_grad->dims()[axis]; PADDLE_ENFORCE_GT( axis_dim, 0, @@ -201,14 +202,14 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { "axis dimention is %d.", axis_dim)); - const int n = SizeToAxis(axis, logit_grad->dims()); + const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); PADDLE_ENFORCE_GT( n, 0, platform::errors::InvalidArgument( "The size of axis should be larger than 0, but received " "SizeToAxis of logit_grad is %d.", n)); - const int d = SizeFromAxis(axis, logit_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, 
logit_grad->dims()); Tensor logit_grad_2d, labels_2d, out_grad_2d; logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d}); labels_2d.ShareDataWith(labels).Resize({n, labels.numel() / n}); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index a5576ab5af3fd..1f1fbea090c13 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" + #include #include #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { @@ -40,15 +41,16 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { "the npu kernel of softmax_with_cross_entropy.")); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - const int n = SizeToAxis(axis, logits->dims()); - const int d = SizeFromAxis(axis, logits->dims()); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + const int n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); PADDLE_ENFORCE_EQ( labels->numel(), n, platform::errors::Unimplemented( - "The size of labels should be equal to SizeToAxis of logits," - "but got size of labels is %d and SizeToAxis is %d.", + "The size of labels should be equal to phi::funcs::SizeToAxis of " + "logits," + "but got size of labels is %d and phi::funcs::SizeToAxis is %d.", labels->numel(), n)); loss->mutable_data(ctx.GetPlace()); @@ -97,9 +99,9 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { logits_grad->mutable_data(ctx.GetPlace()); const int rank = logits_grad->dims().size(); - const int axis = CanonicalAxis(ctx.Attr("axis"), rank); - const int n = SizeToAxis(axis, logits_grad->dims()); - const int d = SizeFromAxis(axis, logits_grad->dims()); + const int axis = phi::funcs::CanonicalAxis(ctx.Attr("axis"), rank); + const int n = phi::funcs::SizeToAxis(axis, logits_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits_grad->dims()); Tensor logits_grad_2d, loss_grad_1d, backprop_2d; diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc index 650e488c5e10b..d9149b85c6a0f 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_xpu.cc @@ -38,13 +38,13 @@ class SoftmaxWithCrossEntropyXPUKernel : public framework::OpKernel { Tensor* softmax = context.Output("Softmax"); Tensor* loss = context.Output("Loss"); const int rank = logits->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( "axis should == rank - 1")); softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - const int n = SizeToAxis(axis, logits->dims()); - const int d = SizeFromAxis(axis, logits->dims()); + const 
int n = phi::funcs::SizeToAxis(axis, logits->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logits->dims()); std::vector logits_dims = phi::vectorize(logits->dims()); const bool soft_label = context.Attr("soft_label"); @@ -122,11 +122,11 @@ class SoftmaxWithCrossEntropyGradXPUKernel : public framework::OpKernel { auto ignore_index = context.Attr("ignore_index"); const int rank = logit_grad->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); + const int axis = phi::funcs::CanonicalAxis(context.Attr("axis"), rank); PADDLE_ENFORCE_EQ(axis, rank - 1, platform::errors::InvalidArgument( "axis should == rank - 1")); - const int n = SizeToAxis(axis, logit_grad->dims()); - const int d = SizeFromAxis(axis, logit_grad->dims()); + const int n = phi::funcs::SizeToAxis(axis, logit_grad->dims()); + const int d = phi::funcs::SizeFromAxis(axis, logit_grad->dims()); auto& dev_ctx = context.template device_context(); diff --git a/paddle/fluid/operators/test_common_infer_shape_functions.cc b/paddle/fluid/operators/test_common_infer_shape_functions.cc index f04ba72a1e181..a7c7e33f58af6 100644 --- a/paddle/fluid/operators/test_common_infer_shape_functions.cc +++ b/paddle/fluid/operators/test_common_infer_shape_functions.cc @@ -22,7 +22,7 @@ limitations under the License. */ USE_OP(relu); USE_OP_ITSELF(elementwise_add); -USE_OP(softmax); +USE_OP_ITSELF(softmax); namespace paddle { namespace operators { diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 5fa80d3a57741..603ce0817c4eb 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -220,4 +220,11 @@ class GPUContext : public DeviceContext { std::unique_ptr impl_; }; +// Note: In order to register the kernel of CUDNN, GPUDNNContext is required. +// Currently, CUDNN kernel directly uses GPUContext. 
But if the kernel function +// has the same name, this will lead to duplicate instantiations of GPU kernel +// and GPUDNN kernel function, so if we using GPUDNNContext = GPUContext, we +// must use different function name for cudnn kernel +using GPUDNNContext = GPUContext; + } // namespace phi diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 1d3e4369c6948..4b7bf65be39cb 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -50,7 +50,7 @@ enum class Backend : uint8_t { // the third library backend MKLDNN, - CUDNN, + GPUDNN, // cuDNN and hipDNN // end of backend types NUM_BACKENDS, @@ -112,8 +112,8 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::MKLDNN: os << "MKLDNN"; break; - case Backend::CUDNN: - os << "CUDNN"; + case Backend::GPUDNN: + os << "GPUDNN"; break; default: { size_t device_type_id_ = static_cast(backend) - @@ -145,8 +145,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::NPU; } else if (s == std::string("MKLDNN")) { return Backend::MKLDNN; - } else if (s == std::string("CUDNN")) { - return Backend::CUDNN; + } else if (s == std::string("GPUDNN")) { + return Backend::GPUDNN; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 1cdcdef2c12ee..6ed9c88d70510 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -988,6 +988,18 @@ inline std::ostream& operator<<(std::ostream& os, const float16& a) { return os; } +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + } // namespace dtype } // namespace phi diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index a5b7b869b948d..f7dab1d34c980 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -58,7 +58,7 @@ phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { return phi::CPUPlace(); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - case phi::Backend::CUDNN: + case phi::Backend::GPUDNN: return phi::GPUPlace( set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); #endif diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 4a79f191c23b3..f27adf1de149b 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -15,8 +15,15 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function i set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) -# auto build kernel targets by cmake -register_kernels(DEPS ${COMMON_KERNEL_DEPS}) +# NOTE: Some kernels depend on some targets that are not commonly used. +# These targets are not suitable for common dependencies. +# In this case, you need to manually generate them here. 
+set(MANUAL_BUILD_KERNELS softmax_kernel softmax_grad_kernel) +kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) +kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) + +# auto parse and build kernel targets by cmake +register_kernels(EXCLUDES ${MANUAL_BUILD_KERNELS} DEPS ${COMMON_KERNEL_DEPS}) # phi sparse kernels add_subdirectory(sparse) diff --git a/paddle/phi/kernels/cpu/softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/softmax_grad_kernel.cc new file mode 100644 index 0000000000000..ef90f9c6762d6 --- /dev/null +++ b/paddle/phi/kernels/cpu/softmax_grad_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" + +PD_REGISTER_KERNEL( + softmax_grad, CPU, ALL_LAYOUT, phi::SoftmaxGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/softmax_kernel.cc b/paddle/phi/kernels/cpu/softmax_kernel.cc new file mode 100644 index 0000000000000..537b4326681a1 --- /dev/null +++ b/paddle/phi/kernels/cpu/softmax_kernel.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_kernel_impl.h" + +PD_REGISTER_KERNEL( + softmax, CPU, ALL_LAYOUT, phi::SoftmaxRawKernel, float, double) {} diff --git a/paddle/phi/kernels/funcs/axis_utils.h b/paddle/phi/kernels/funcs/axis_utils.h new file mode 100644 index 0000000000000..02a89471889a7 --- /dev/null +++ b/paddle/phi/kernels/funcs/axis_utils.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/ddim.h" + +namespace phi { +namespace funcs { + +static inline int CanonicalAxis(const int axis, const int rank) { + if (axis < 0) { + return axis + rank; + } + return axis; +} + +static inline int SizeToAxis(const int axis, DDim dims) { + int size = 1; + for (int i = 0; i < axis; i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeFromAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +static inline int SizeOutAxis(const int axis, DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/funcs/concat_funcs.h b/paddle/phi/kernels/funcs/concat_funcs.h index 32237e2cc2366..70e3545b981fa 100644 --- a/paddle/phi/kernels/funcs/concat_funcs.h +++ b/paddle/phi/kernels/funcs/concat_funcs.h @@ -92,4 +92,4 @@ static inline phi::DDim ComputeAndCheckShape( } } // namespace funcs -} // namespace phi +} // namespace phi diff --git a/paddle/phi/kernels/funcs/eigen/elementwise.cu b/paddle/phi/kernels/funcs/eigen/elementwise.cu index 96d2ddba03c28..3855ba8ccf945 100644 --- a/paddle/phi/kernels/funcs/eigen/elementwise.cu +++ b/paddle/phi/kernels/funcs/eigen/elementwise.cu @@ -55,5 +55,5 @@ struct EigenSub { template struct EigenSub; -} // namespace fucns +} // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/gpu/softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu new file mode 100644 index 0000000000000..aa496d3cd391b --- /dev/null +++ b/paddle/phi/kernels/gpu/softmax_grad_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_grad_kernel_impl.h" + +PD_REGISTER_KERNEL(softmax_grad, + GPU, + ALL_LAYOUT, + phi::SoftmaxGradKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/softmax_kernel.cu b/paddle/phi/kernels/gpu/softmax_kernel.cu new file mode 100644 index 0000000000000..32efb9b776419 --- /dev/null +++ b/paddle/phi/kernels/gpu/softmax_kernel.cu @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/softmax_kernel_impl.h" + +PD_REGISTER_KERNEL(softmax, + GPU, + ALL_LAYOUT, + phi::SoftmaxRawKernel, + float, + double, + phi::dtype::float16) {} diff --git a/paddle/fluid/operators/softmax_cudnn_op.cu.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h similarity index 63% rename from paddle/fluid/operators/softmax_cudnn_op.cu.h rename to paddle/phi/kernels/gpudnn/softmax_gpudnn.h index dc5166f4f994f..45798b88bb58a 100644 --- a/paddle/fluid/operators/softmax_cudnn_op.cu.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -14,18 +14,20 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" -#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +// See Note [ Why still include the fluid headers? ] #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -namespace paddle { -namespace operators { +namespace phi { -using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; -using DataLayout = platform::DataLayout; -using Tensor = framework::Tensor; +using ScopedTensorDescriptor = paddle::platform::ScopedTensorDescriptor; +using GPUDNNDataLayout = paddle::platform::DataLayout; // Vectorization trait 4 * sizeof(T) template @@ -41,7 +43,7 @@ class VecT4 { using Type = int4; }; template <> -class VecT4 { +class VecT4 { public: using Type = int2; }; @@ -60,7 +62,7 @@ class VecT2 { using Type = int2; }; template <> -class VecT2 { +class VecT2 { public: using Type = int; }; @@ -77,7 +79,8 @@ __device__ __forceinline__ void WarpReduceSum(T* sum) { for (int offset = WarpSize / 2; offset > 0; offset /= 2) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { - T sum_val = platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + T sum_val = + paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = sum[i] + sum_val; } } @@ -89,14 +92,13 @@ __device__ __forceinline__ void WarpReduceMax(T* sum) { for (int offset = WarpSize / 2; offset > 0; offset /= 2) { #pragma unroll for (int i = 0; i < BatchSize; ++i) { - T max_val = platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + T max_val = + paddle::platform::CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); sum[i] = max(sum[i], max_val); } } } -namespace kps = paddle::operators::kernel_primitives; - template struct ReduceMaxFunctor { inline Ty initial() { return -std::numeric_limits::infinity(); } @@ -248,10 +250,15 @@ One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle api to compute max (sum) in one warp. */ -template -__global__ void WarpSoftmaxForward(T* softmax, const T* src, - const int batch_size, const int stride, +__global__ void WarpSoftmaxForward(T* softmax, + const T* src, + const int batch_size, + const int stride, const int element_count) { constexpr int kDimCeil = 1 << Log2Elements; constexpr int kWarpSize = (kDimCeil < 32) ? 
kDimCeil : 32; @@ -302,9 +309,13 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src, } // compute max - kps::Reduce, - kMode::kLocalMode>(&max[0], &srcdata[0][0][0], - ReduceMaxFunctor(), true); + kps::Reduce, + kMode::kLocalMode>( + &max[0], &srcdata[0][0][0], ReduceMaxFunctor(), true); WarpReduceMax(max); // compute sum @@ -313,9 +324,13 @@ __global__ void WarpSoftmaxForward(T* softmax, const T* src, kps::ElementwiseUnary>( &srcdata[i][0][0], &srcdata[i][0][0], ExpSubFunctor(max[i])); } - kps::Reduce, - kMode::kLocalMode>(&sum[0], &srcdata[0][0][0], - kps::AddFunctor(), true); + kps::Reduce, + kMode::kLocalMode>( + &sum[0], &srcdata[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); // write data to global memory @@ -340,10 +355,16 @@ One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle api to compute max (sum) in one warp. */ -template -__global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, - int batch_size, int stride, +__global__ void WarpSoftmaxBackward(T* dst, + const T* grad, + const T* src, + int batch_size, + int stride, int element_count) { constexpr int kVSize = sizeof(VecT) / sizeof(T); constexpr int kDimCeil = 1 << Log2Elements; @@ -403,7 +424,11 @@ __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, AccT* srcptr = reinterpret_cast(&src_tmp[0][0][0]); kps::ElementwiseBinary>( &sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor()); - kps::Reduce, + kps::Reduce, kps::details::ReduceMode::kLocalMode>( &sum[0], &sum_tmp[0][0][0], kps::AddFunctor(), true); WarpReduceSum(sum); @@ -429,7 +454,10 @@ __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, #define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, AccT) \ case Log2Elements: \ - WarpSoftmaxForward<<>>( \ dst, src, batch_size, stride, element_count); \ break; @@ -438,12 +466,16 @@ __global__ void WarpSoftmaxBackward(T* dst, const T* grad, const T* src, Wrapper of softmax formward with template instantiation on size of input. */ template -void SwitchWarpSoftmaxForward(const int blocks, const dim3 threads, - const platform::CUDADeviceContext& dev_ctx, - T* dst, const T* src, const int batch_size, - const int stride, const int element_count, +void SwitchWarpSoftmaxForward(const int blocks, + const dim3 threads, + const GPUContext& dev_ctx, + T* dst, + const T* src, + const int batch_size, + const int stride, + const int element_count, int Log2Elements) { - using AccT = typename details::MPTypeTrait::Type; + using AccT = typename phi::dtype::MPTypeTrait::Type; switch (Log2Elements) { SOFTMAX_WARP_FORWARD_CASE(0, AccT); SOFTMAX_WARP_FORWARD_CASE(1, AccT); @@ -462,7 +494,10 @@ void SwitchWarpSoftmaxForward(const int blocks, const dim3 threads, #define SOFTMAX_WARP_BACKWARD_CASE(Log2Elements, AccT) \ case Log2Elements: \ - WarpSoftmaxBackward<<>>( \ dst, grad, src, batch_size, stride, element_count); \ break; @@ -471,12 +506,17 @@ void SwitchWarpSoftmaxForward(const int blocks, const dim3 threads, Wrapper of softmax backward with template instantiation on size of input. 
*/ template -void SwitchWarpSoftmaxBackward(const int blocks, const dim3 threads, - const platform::CUDADeviceContext& dev_ctx, - T* dst, const T* grad, const T* src, - const int batch_size, const int stride, - const int element_count, int Log2Elements) { - using AccT = typename details::MPTypeTrait::Type; +void SwitchWarpSoftmaxBackward(const int blocks, + const dim3 threads, + const GPUContext& dev_ctx, + T* dst, + const T* grad, + const T* src, + const int batch_size, + const int stride, + const int element_count, + int Log2Elements) { + using AccT = typename phi::dtype::MPTypeTrait::Type; switch (Log2Elements) { SOFTMAX_WARP_BACKWARD_CASE(0, AccT); SOFTMAX_WARP_BACKWARD_CASE(1, AccT); @@ -501,12 +541,12 @@ void SwitchWarpSoftmaxBackward(const int blocks, const dim3 threads, * Better performence when axis != -1 */ -static void GetGridDim(int high_dim, int mid_dim, int low_dim, - const dim3& block, dim3* grid) { - int device_id = paddle::platform::GetCurrentDeviceId(); - int max_mp = paddle::platform::GetGPUMultiProcessors(device_id); +static void GetGridDim( + int high_dim, int mid_dim, int low_dim, const dim3& block, dim3* grid) { + int device_id = phi::backends::gpu::GetCurrentDeviceId(); + int max_mp = phi::backends::gpu::GetGPUMultiProcessors(device_id); int max_threads_per_mp = - paddle::platform::GetGPUMaxThreadsPerMultiProcessor(device_id); + phi::backends::gpu::GetGPUMaxThreadsPerMultiProcessor(device_id); int max_threads = max_threads_per_mp * max_mp; int num_threads = block.x * block.y; int max_num_blocks = max_threads / num_threads; @@ -532,16 +572,17 @@ static void GetBlockDim(int mid_dim, int low_dim, dim3* block) { block->x = std::min(block_x, static_cast(max_num_threads / block->y)); } -static void GetLaunchConfig(int high_dim, int mid_dim, int low_dim, dim3* grid, - dim3* block) { +static void GetLaunchConfig( + int high_dim, int mid_dim, int low_dim, dim3* grid, dim3* block) { GetBlockDim(mid_dim, low_dim, block); GetGridDim(high_dim, mid_dim, low_dim, *block, grid); } -template class Functor> -__global__ void NormalSoftmaxForward(T* output, const T* input, int high_dim, - int mid_dim, int low_dim) { +__global__ void NormalSoftmaxForward( + T* output, const T* input, int high_dim, int mid_dim, int low_dim) { using kMode = kps::details::ReduceMode; const int high_stride = mid_dim * low_dim; const int mid_stride = low_dim; @@ -584,11 +625,15 @@ __global__ void NormalSoftmaxForward(T* output, const T* input, int high_dim, } } -template class Functor> -__global__ void NormalSoftmaxBackward(T* input_grad, const T* output_grad, - const T* output, int high_dim, - int mid_dim, int low_dim) { +__global__ void NormalSoftmaxBackward(T* input_grad, + const T* output_grad, + const T* output, + int high_dim, + int mid_dim, + int low_dim) { using kMode = kps::details::ReduceMode; const int high_stride = mid_dim * low_dim; const int mid_stride = low_dim; @@ -622,58 +667,79 @@ __global__ void NormalSoftmaxBackward(T* input_grad, const T* output_grad, } template -void LaunchNormalSoftmaxForward(const platform::CUDADeviceContext& dev_ctx, - T* output_data, const T* input_data, - int high_dim, int mid_dim, int low_dim) { - using AccT = typename details::MPTypeTrait::Type; +void LaunchNormalSoftmaxForward(const GPUContext& dev_ctx, + T* output_data, + const T* input_data, + int high_dim, + int mid_dim, + int low_dim) { + using AccT = typename phi::dtype::MPTypeTrait::Type; dim3 grid, block; GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block); if (LogMode) { NormalSoftmaxForward< - T, 
AccT, + T, + AccT, LogSoftmaxForwardFunctor><<>>( output_data, input_data, high_dim, mid_dim, low_dim); } else { NormalSoftmaxForward< - T, AccT, SoftmaxForwardFunctor><<>>( + T, + AccT, + SoftmaxForwardFunctor><<>>( output_data, input_data, high_dim, mid_dim, low_dim); } } template -void LaunchNormalSoftmaxBackward(const platform::CUDADeviceContext& dev_ctx, - T* input_grad_data, const T* output_grad_data, - const T* output_data, int high_dim, - int mid_dim, int low_dim) { - using AccT = typename details::MPTypeTrait::Type; +void LaunchNormalSoftmaxBackward(const GPUContext& dev_ctx, + T* input_grad_data, + const T* output_grad_data, + const T* output_data, + int high_dim, + int mid_dim, + int low_dim) { + using AccT = typename phi::dtype::MPTypeTrait::Type; dim3 grid, block; GetLaunchConfig(high_dim, mid_dim, low_dim, &grid, &block); if (LogMode) { NormalSoftmaxBackward< - T, AccT, + T, + AccT, LogSoftmaxBackwardFunctor><<>>( - input_grad_data, output_grad_data, output_data, high_dim, mid_dim, + input_grad_data, + output_grad_data, + output_data, + high_dim, + mid_dim, low_dim); } else { NormalSoftmaxBackward< - T, AccT, SoftmaxBackwardFunctor><<>>( - input_grad_data, output_grad_data, output_data, high_dim, mid_dim, + T, + AccT, + SoftmaxBackwardFunctor><<>>( + input_grad_data, + output_grad_data, + output_data, + high_dim, + mid_dim, low_dim); } } template -void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const Tensor& x, const int input_axis, - Tensor* out) { +void SoftmaxForwardCUDAKernelDriver(const GPUContext& dev_ctx, + const DenseTensor& x, + const int input_axis, + DenseTensor* out) { auto* out_data = out->data(); auto dims = x.dims(); const int rank = dims.size(); - const int axis = CanonicalAxis(input_axis, rank); + const int axis = phi::funcs::CanonicalAxis(input_axis, rank); const int dim = dims[axis]; - const int N = SizeToAxis(axis, dims); - const int D = SizeOutAxis(axis, dims); + const int N = phi::funcs::SizeToAxis(axis, dims); + const int D = phi::funcs::SizeOutAxis(axis, dims); constexpr int max_dim = 512; constexpr int warps_per_block = 4; @@ -697,25 +763,43 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, using T2 = typename VecT2::Type; if (dim % 4 == 0) { - SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, - out_data, x.data(), N, dim, - dim, kDimLog2); + SwitchWarpSoftmaxForward(blocks, + threads, + dev_ctx, + out_data, + x.data(), + N, + dim, + dim, + kDimLog2); } else if (dim % 2 == 0) { - SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, - out_data, x.data(), N, dim, - dim, kDimLog2); + SwitchWarpSoftmaxForward(blocks, + threads, + dev_ctx, + out_data, + x.data(), + N, + dim, + dim, + kDimLog2); } else { - SwitchWarpSoftmaxForward(blocks, threads, dev_ctx, - out_data, x.data(), N, dim, - dim, kDimLog2); + SwitchWarpSoftmaxForward(blocks, + threads, + dev_ctx, + out_data, + x.data(), + N, + dim, + dim, + kDimLog2); } } else if (D > 1) { - LaunchNormalSoftmaxForward(dev_ctx, out_data, x.data(), N, - dim, D); + LaunchNormalSoftmaxForward( + dev_ctx, out_data, x.data(), N, dim, D); } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; + GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #else @@ -728,46 +812,74 @@ void SoftmaxForwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? 
MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), desc_, x.data(), - platform::CudnnDataType::kZero(), desc_, out_data, - MIOPEN_SOFTMAX_LOG, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxForward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + out_data, + MIOPEN_SOFTMAX_LOG, + mode)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2( - handle, platform::CudnnDataType::kOne(), desc_, x.data(), - platform::CudnnDataType::kZero(), desc_, out_data, - MIOPEN_SOFTMAX_ACCURATE, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxForward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + out_data, + MIOPEN_SOFTMAX_ACCURATE, + mode)); } #else auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - desc_, x.data(), platform::CudnnDataType::kZero(), desc_, + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward( + handle, + CUDNN_SOFTMAX_LOG, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, out_data)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward( - handle, CUDNN_SOFTMAX_ACCURATE, mode, - platform::CudnnDataType::kOne(), desc_, x.data(), - platform::CudnnDataType::kZero(), desc_, out_data)); + PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward( + handle, + CUDNN_SOFTMAX_ACCURATE, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + x.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + out_data)); } #endif } } template -void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, - const Tensor& out, const Tensor& dout, - const int input_axis, Tensor* dx) { +void SoftmaxBackwardCUDAKernelDriver(const GPUContext& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + const int input_axis, + DenseTensor* dx) { auto* dx_data = dx->data(); auto dims = out.dims(); const int rank = dims.size(); - const int axis = CanonicalAxis(input_axis, rank); + const int axis = phi::funcs::CanonicalAxis(input_axis, rank); const int dim = dims[axis]; - const int N = SizeToAxis(axis, dims); - const int D = SizeOutAxis(axis, dims); + const int N = phi::funcs::SizeToAxis(axis, dims); + const int D = phi::funcs::SizeOutAxis(axis, dims); constexpr int max_dim = 512; constexpr int warps_per_block = 4; @@ -788,25 +900,46 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, using T4 = typename VecT4::Type; using T2 = typename VecT2::Type; if (dim % 4 == 0) { - SwitchWarpSoftmaxBackward( - blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, - dim, dim, kDimLog2); + SwitchWarpSoftmaxBackward(blocks, + threads, + dev_ctx, + dx_data, + dout.data(), + out.data(), + N, + dim, + dim, + kDimLog2); } else if (dim % 2 == 0) { - SwitchWarpSoftmaxBackward( - blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, - dim, dim, kDimLog2); + SwitchWarpSoftmaxBackward(blocks, + threads, + dev_ctx, + dx_data, + dout.data(), + 
out.data(), + N, + dim, + dim, + kDimLog2); } else { - SwitchWarpSoftmaxBackward( - blocks, threads, dev_ctx, dx_data, dout.data(), out.data(), N, - dim, dim, kDimLog2); + SwitchWarpSoftmaxBackward(blocks, + threads, + dev_ctx, + dx_data, + dout.data(), + out.data(), + N, + dim, + dim, + kDimLog2); } } else if (D > 1) { - LaunchNormalSoftmaxBackward(dev_ctx, dx_data, dout.data(), - out.data(), N, dim, D); + LaunchNormalSoftmaxBackward( + dev_ctx, dx_data, dout.data(), out.data(), N, dim, D); } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; - DataLayout layout = DataLayout::kNCHW; + GPUDNNDataLayout layout = GPUDNNDataLayout::kNCHW; #ifdef PADDLE_WITH_HIP miopenTensorDescriptor_t desc_ = desc.descriptor(layout, tensor_dims); #else @@ -819,33 +952,68 @@ void SoftmaxBackwardCUDAKernelDriver(const platform::CUDADeviceContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( - handle, platform::CudnnDataType::kOne(), desc_, out.data(), - desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, - dx_data, MIOPEN_SOFTMAX_LOG, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxBackward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data, + MIOPEN_SOFTMAX_LOG, + mode)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2( - handle, platform::CudnnDataType::kOne(), desc_, out.data(), - desc_, dout.data(), platform::CudnnDataType::kZero(), desc_, - dx_data, MIOPEN_SOFTMAX_ACCURATE, mode)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::miopenSoftmaxBackward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data, + MIOPEN_SOFTMAX_ACCURATE, + mode)); } #else auto mode = axis == rank - 1 ? 
CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; if (LogMode) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( - handle, CUDNN_SOFTMAX_LOG, mode, platform::CudnnDataType::kOne(), - desc_, out.data(), desc_, dout.data(), - platform::CudnnDataType::kZero(), desc_, dx_data)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSoftmaxBackward( + handle, + CUDNN_SOFTMAX_LOG, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data)); } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward( - handle, CUDNN_SOFTMAX_ACCURATE, mode, - platform::CudnnDataType::kOne(), desc_, out.data(), desc_, - dout.data(), platform::CudnnDataType::kZero(), desc_, dx_data)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSoftmaxBackward( + handle, + CUDNN_SOFTMAX_ACCURATE, + mode, + paddle::platform::CudnnDataType::kOne(), + desc_, + out.data(), + desc_, + dout.data(), + paddle::platform::CudnnDataType::kZero(), + desc_, + dx_data)); } #endif } } -} // namespace operators -} // namespace paddle +} // namespace phi diff --git a/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu new file mode 100644 index 0000000000000..56e5fef6e37e4 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/softmax_grad_kernel_gpudnn.cu @@ -0,0 +1,50 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void SoftmaxGradGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + dev_ctx.template Alloc(x_grad); + SoftmaxBackwardCUDAKernelDriver(dev_ctx, out, out_grad, axis, x_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(softmax_grad, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxGradGPUDNNKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu new file mode 100644 index 0000000000000..427d1729a13a8 --- /dev/null +++ b/paddle/phi/kernels/gpudnn/softmax_kernel_gpudnn.cu @@ -0,0 +1,49 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h" + +namespace phi { + +template +void SoftmaxRawGPUDNNKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + dev_ctx.template Alloc(out); + SoftmaxForwardCUDAKernelDriver(dev_ctx, x, axis, out); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(softmax, + GPUDNN, + ALL_LAYOUT, + phi::SoftmaxRawGPUDNNKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h b/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h new file mode 100644 index 0000000000000..915bf16a92df1 --- /dev/null +++ b/paddle/phi/kernels/impl/softmax_grad_kernel_impl.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void SoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + const int rank = x_grad->dims().size(); + const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = x_grad->dims()[calc_axis]; + + // allocate memory on device. + dev_ctx.template Alloc(x_grad); + if (x_grad->numel() == 0) { + return; + } + + const int n = phi::funcs::SizeToAxis(calc_axis, x_grad->dims()); + const int d = phi::funcs::SizeFromAxis(calc_axis, x_grad->dims()); + DenseTensor dX_2d, Out_2d, dOut_2d; + dX_2d.ShareDataWith(*x_grad).Resize({n, d}); + Out_2d.ShareDataWith(out).Resize({n, d}); + dOut_2d.ShareDataWith(out_grad).Resize({n, d}); + + paddle::operators::math::SoftmaxGradFunctor()( + dev_ctx, axis_dim, &Out_2d, &dOut_2d, &dX_2d); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/softmax_kernel_impl.h b/paddle/phi/kernels/impl/softmax_kernel_impl.h new file mode 100644 index 0000000000000..6552f6ed581f4 --- /dev/null +++ b/paddle/phi/kernels/impl/softmax_kernel_impl.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/kernels/softmax_kernel.h" + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void SoftmaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + const int calc_axis = phi::funcs::CanonicalAxis(axis, rank); + int axis_dim = x.dims()[calc_axis]; + + // allocate memory on device. + dev_ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + + const int n = phi::funcs::SizeToAxis(calc_axis, x.dims()); + const int d = phi::funcs::SizeFromAxis(calc_axis, x.dims()); + DenseTensor X_2d, Out_2d; + X_2d.ShareDataWith(x).Resize({n, d}); + Out_2d.ShareDataWith(*out).Resize({n, d}); + paddle::operators::math::SoftmaxFunctor()( + dev_ctx, axis_dim, &X_2d, &Out_2d); +} + +} // namespace phi diff --git a/paddle/phi/kernels/softmax_grad_kernel.h b/paddle/phi/kernels/softmax_grad_kernel.h new file mode 100644 index 0000000000000..4ecf65c1f17c7 --- /dev/null +++ b/paddle/phi/kernels/softmax_grad_kernel.h @@ -0,0 +1,29 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/cast_kernel.h" + +namespace phi { + +template +void SoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/softmax_kernel.h b/paddle/phi/kernels/softmax_kernel.h new file mode 100644 index 0000000000000..ca69d652770aa --- /dev/null +++ b/paddle/phi/kernels/softmax_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/cast_kernel.h" + +namespace phi { + +template +void SoftmaxRawKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DenseTensor* out); + +template +void SoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + int axis, + DataType dtype, + DenseTensor* out) { + auto cast_x = phi::Cast(dev_ctx, x, dtype); + phi::SoftmaxRawKernel(dev_ctx, axis, out); +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/softmax_sig.cc b/paddle/phi/ops/compat/softmax_sig.cc new file mode 100644 index 0000000000000..65a915b51d08a --- /dev/null +++ b/paddle/phi/ops/compat/softmax_sig.cc @@ -0,0 +1,34 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SoftmaxOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("softmax", {"X"}, {"axis"}, {"Out"}); +} + +KernelSignature SoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(softmax, phi::SoftmaxOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(softmax_grad, phi::SoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index d74a35c9eae2e..fa4ffc84bf587 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -41,8 +41,8 @@ TEST(Backend, OStream) { oss << phi::Backend::MKLDNN; EXPECT_EQ(oss.str(), "MKLDNN"); oss.str(""); - oss << phi::Backend::CUDNN; - EXPECT_EQ(oss.str(), "CUDNN"); + oss << phi::Backend::GPUDNN; + EXPECT_EQ(oss.str(), "GPUDNN"); oss.str(""); try { oss << phi::Backend::NUM_BACKENDS; @@ -60,7 +60,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::XPU, pexp::StringToBackend("XPU")); EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); - EXPECT_EQ(phi::Backend::CUDNN, pexp::StringToBackend("CUDNN")); + EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); EXPECT_EQ(static_cast( static_cast(phi::Backend::NUM_BACKENDS) + 1), pexp::StringToBackend("CustomBackend")); From 44da9b420927b4e42ec0ad317d0acb8d0575a3c1 Mon Sep 17 00:00:00 2001 From: joeqiao12 <45232181+joeqiao12@users.noreply.github.com> Date: Fri, 25 Feb 2022 10:32:54 +0800 Subject: [PATCH 48/85] add reduce_min and reduce_max (#39899) --- .../operators/reduce_ops/reduce_max_op_mlu.cc | 93 ++++++++++ .../operators/reduce_ops/reduce_min_op_mlu.cc | 93 ++++++++++ paddle/fluid/operators/reduce_ops/reduce_op.h | 11 +- .../unittests/mlu/test_reduce_max_op_mlu.py | 170 ++++++++++++++++++ .../unittests/mlu/test_reduce_min_op_mlu.py | 170 ++++++++++++++++++ 5 files changed, 532 insertions(+), 5 deletions(-) create mode 100644 
paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc create mode 100644 paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc new file mode 100644 index 0000000000000..7e02f0268b5e5 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMaxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + int out_dtype = context.Attr("out_dtype"); + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + auto place = context.GetPlace(); + framework::Tensor cast_out(input->type()); + cast_out.Resize(output->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = framework::TransToProtoVarType(input->dtype()); + + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + output->mutable_data(place); + } + } else { + output->ShareDataWith(cast_out); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_MAX, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = 
paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_max, ops::ReduceMaxMLUKernel, + ops::ReduceMaxMLUKernel, + ops::ReduceMaxMLUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc new file mode 100644 index 0000000000000..daf5965fd5462 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_min_op_mlu.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/mlu/mlu_baseop.h" +#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" + +namespace paddle { +namespace operators { + +template +class ReduceMinMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + int out_dtype = context.Attr("out_dtype"); + bool reduce_all = context.Attr("reduce_all"); + auto dims = context.Attr>("dim"); + auto input_dims = framework::vectorize(input->dims()); + const auto& input_dim_size = input->dims().size(); + std::vector reduce_dims; + if (reduce_all) { + for (size_t i = 0; i < input_dims.size(); i++) { + reduce_dims.push_back(static_cast(i)); + } + } else { + for (size_t i = 0; i < dims.size(); ++i) { + if (dims[i] < 0) { + reduce_dims.push_back(dims[i] + input_dim_size); + } else { + reduce_dims.push_back(dims[i]); + } + } + } + + auto place = context.GetPlace(); + framework::Tensor cast_out(input->type()); + cast_out.Resize(output->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = framework::TransToProtoVarType(input->dtype()); + + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + if (framework::TransToProtoVarType(input->type()) != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + output->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + output->mutable_data(place); + } + } else { + output->ShareDataWith(cast_out); + } + + MLUCnnlTensorDesc input_desc(*input, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(input->dtype())); + MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY, + ToCnnlDataType(output->dtype())); + + MLUCnnlReduceDesc reduction_desc( + reduce_dims, CNNL_REDUCE_MIN, ToCnnlDataType(), + CNNL_NOT_PROPAGATE_NAN, CNNL_REDUCE_NO_INDICES, CNNL_32BIT_INDICES); + + MLUCnnl::Reduce(context, true /*need_workspace*/, reduction_desc.get(), + nullptr, input_desc.get(), GetBasePtr(input), + 0 /*indices_size*/, nullptr, nullptr, output_desc.get(), + GetBasePtr(output)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(reduce_min, ops::ReduceMinMLUKernel, + ops::ReduceMinMLUKernel, + ops::ReduceMinMLUKernel); diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index ca3575f5dea84..eb39f069e56b7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -541,11 +541,12 @@ class ReduceOp : public framework::OperatorWithKernel { #endif if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()) || - platform::is_npu_place(ctx.GetPlace()), - true, - platform::errors::InvalidArgument( - "float16 can only be used on GPU or NPU place")); + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()) || + platform::is_npu_place(ctx.GetPlace()) || + platform::is_mlu_place(ctx.GetPlace()), + true, platform::errors::InvalidArgument( + "float16 can only be used on GPU or NPU or MLU place")); } return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py new file mode 100644 index 0000000000000..ef33719d368e8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_max_op_mlu.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestMLUReduceMaxOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-1]} + self.outputs = { + 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpMultiAxises(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-2, -1]} + self.outputs = { + 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])) + } + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceAll(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].max()} + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpWithOutDtype_int32(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': + self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.int32) + } + + def init_dtype(self): + self.dtype = np.int32 + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpWithOutDtype_fp16(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP16) + } + self.outputs = { + 'Out': self.inputs['X'].max( + 
axis=tuple(self.attrs['dim'])).astype(np.float16) + } + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +@skip_check_grad_ci( + reason="reduce_max is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMaxOpWithOutDtype_fp32(TestMLUReduceMaxOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_max" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.outputs = { + 'Out': self.inputs['X'].max( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + def init_dtype(self): + self.dtype = np.float32 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py new file mode 100644 index 0000000000000..284f8f984c232 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_min_op_mlu.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestMLUReduceMinOp(OpTest): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-1]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpMultiAxises(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [-2, -1]} + self.outputs = { + 'Out': self.inputs['X'].min(axis=tuple(self.attrs['dim'])) + } + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceAll(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].min()} + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpWithOutDtype_int32(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.INT32) + } + self.outputs = { + 'Out': + self.inputs['X'].min(axis=tuple(self.attrs['dim'])).astype(np.int32) + } + + def init_dtype(self): + self.dtype = np.int32 + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpWithOutDtype_fp16(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP16) + } + self.outputs = { + 'Out': self.inputs['X'].min( + 
axis=tuple(self.attrs['dim'])).astype(np.float16) + } + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +@skip_check_grad_ci( + reason="reduce_min is discontinuous non-derivable function," + " its gradient check is not supported by unittest framework.") +class TestReduceMinOpWithOutDtype_fp32(TestMLUReduceMinOp): + """Remove Min with subgradient from gradient check to confirm the success of CI.""" + + def setUp(self): + self.op_type = "reduce_min" + self.set_mlu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = { + 'dim': [-2, -1], + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.outputs = { + 'Out': self.inputs['X'].min( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + def init_dtype(self): + self.dtype = np.float32 + + +if __name__ == '__main__': + unittest.main() From 22f84122411d738bba344bdf3124e414126aa921 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 25 Feb 2022 10:55:14 +0800 Subject: [PATCH 49/85] [phi] refine code of randint, randperm, unbind kernel (#39909) * refine randint kernel * refine randperm kernel * refine unbind kernel * support op seed --- paddle/phi/kernels/cpu/randint_kernel.cc | 17 ++++---- paddle/phi/kernels/cpu/randperm_kernel.cc | 38 +++++++++++++---- paddle/phi/kernels/cpu/unbind_kernel.cc | 1 + .../kernels/funcs/concat_and_split_functor.cc | 15 ------- .../kernels/funcs/concat_and_split_functor.cu | 18 ++------ .../kernels/funcs/concat_and_split_functor.h | 16 ++++---- paddle/phi/kernels/gpu/randint_kernel.cu | 21 +++++----- paddle/phi/kernels/gpu/randperm_kernel.cu | 41 ++++++++++++++----- paddle/phi/kernels/gpu/unbind_kernel.cu | 3 +- paddle/phi/kernels/impl/unbind_kernel_impl.h | 6 +-- paddle/phi/kernels/randint_kernel.h | 4 +- paddle/phi/kernels/randperm_kernel.h | 6 ++- paddle/phi/ops/compat/randperm_sig.cc | 7 +++- 13 files changed, 108 insertions(+), 85 deletions(-) diff --git a/paddle/phi/kernels/cpu/randint_kernel.cc b/paddle/phi/kernels/cpu/randint_kernel.cc index 5fe56b57452d5..feb418949ba40 100644 --- a/paddle/phi/kernels/cpu/randint_kernel.cc +++ b/paddle/phi/kernels/cpu/randint_kernel.cc @@ -22,42 +22,43 @@ namespace phi { template -void RandintRawKernel(const Context& ctx, +void RandintRawKernel(const Context& dev_ctx, int low, int high, const ScalarArray& shape, DataType dtype, int seed, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); - auto size = out->numel(); + out->Resize(phi::make_ddim(shape.GetData())); + T* data = dev_ctx.template Alloc(out); + auto numel = out->numel(); std::shared_ptr engine; if (seed) { engine = std::make_shared(); engine->seed(seed); } else { - engine = ctx.GetGenerator()->GetCPUEngine(); + engine = dev_ctx.GetGenerator()->GetCPUEngine(); } std::uniform_int_distribution dist(low, high - 1); - auto data = out->data(); - for (int64_t i = 0; i < size; ++i) { + for (int64_t i = 0; i < numel; ++i) { data[i] = dist(*engine); } } template -void RandintKernel(const Context& ctx, +void RandintKernel(const Context& dev_ctx, int low, int high, const ScalarArray& shape, DataType dtype, DenseTensor* out) { - RandintRawKernel(ctx, low, high, shape, dtype, 0, out); + RandintRawKernel(dev_ctx, low, high, shape, dtype, 0, out); } } // namespace phi PD_REGISTER_KERNEL( randint_raw, CPU, ALL_LAYOUT, phi::RandintRawKernel, int, int64_t) {} + PD_REGISTER_KERNEL(randint, CPU, ALL_LAYOUT, phi::RandintKernel, int, int64_t) { } diff --git 
a/paddle/phi/kernels/cpu/randperm_kernel.cc b/paddle/phi/kernels/cpu/randperm_kernel.cc index 28092c8df6d15..6cb435f53b85b 100644 --- a/paddle/phi/kernels/cpu/randperm_kernel.cc +++ b/paddle/phi/kernels/cpu/randperm_kernel.cc @@ -13,20 +13,23 @@ // limitations under the License. #include "paddle/phi/kernels/randperm_kernel.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" + #include "paddle/phi/core/kernel_registry.h" namespace phi { template -void RandpermKernel(const Context& ctx, - int n, - DataType dtype, - DenseTensor* out) { - T* out_data = ctx.template Alloc(out); - auto gen_ptr = ctx.GetHostGenerator(); - auto engine = gen_ptr->GetCPUEngine(); +void RandpermRawKernel( + const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { + T* out_data = dev_ctx.template Alloc(out); + + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } for (int i = 0; i < n; ++i) { out_data[i] = static_cast(i); @@ -34,8 +37,25 @@ void RandpermKernel(const Context& ctx, std::shuffle(out_data, out_data + n, *engine); } +template +void RandpermKernel(const Context& dev_ctx, + int n, + DataType dtype, + DenseTensor* out) { + RandpermRawKernel(dev_ctx, n, dtype, 0, out); +} + } // namespace phi +PD_REGISTER_KERNEL(randperm_raw, + CPU, + ALL_LAYOUT, + phi::RandpermRawKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(randperm, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/cpu/unbind_kernel.cc b/paddle/phi/kernels/cpu/unbind_kernel.cc index 655f8c8aafbf2..39cc2f8fc4662 100644 --- a/paddle/phi/kernels/cpu/unbind_kernel.cc +++ b/paddle/phi/kernels/cpu/unbind_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/unbind_kernel.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unbind_kernel_impl.h" diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cc b/paddle/phi/kernels/funcs/concat_and_split_functor.cc index c8405703a5c16..aa73ba5f68990 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cc +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cc @@ -12,21 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/phi/core/utils/data_type.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" namespace phi { diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.cu b/paddle/phi/kernels/funcs/concat_and_split_functor.cu index 2abfdb606e7e6..840c8872f50f8 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.cu +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.cu @@ -12,23 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" - -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/utils/data_type.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/funcs/concat_and_split_functor.h b/paddle/phi/kernels/funcs/concat_and_split_functor.h index 3af4d878d3cab..4cb15fe539b66 100644 --- a/paddle/phi/kernels/funcs/concat_and_split_functor.h +++ b/paddle/phi/kernels/funcs/concat_and_split_functor.h @@ -13,20 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include -#include + #include -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/utils/data_type.h" +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/memory/memcpy.h" + namespace phi { namespace funcs { diff --git a/paddle/phi/kernels/gpu/randint_kernel.cu b/paddle/phi/kernels/gpu/randint_kernel.cu index b89b714c73d92..66dc5f72a5c70 100644 --- a/paddle/phi/kernels/gpu/randint_kernel.cu +++ b/paddle/phi/kernels/gpu/randint_kernel.cu @@ -25,7 +25,7 @@ namespace phi { template -void RandintRawKernel(const Context& ctx, +void RandintRawKernel(const Context& dev_ctx, int low, int high, const ScalarArray& shape, @@ -34,21 +34,22 @@ void RandintRawKernel(const Context& ctx, DenseTensor* out) { DenseTensor tmp; tmp.Resize(phi::make_ddim(shape.GetData())); - T* tmp_data = ctx.template HostAlloc(&tmp); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - out->ResizeAndAllocate(tmp.dims()); - auto size = out->numel(); + out->Resize(tmp.dims()); + T* data = dev_ctx.template Alloc(out); std::shared_ptr engine; if (seed) { engine = std::make_shared(); engine->seed(seed); } else { - engine = ctx.GetHostGenerator()->GetCPUEngine(); + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); } + std::uniform_int_distribution dist(low, high - 1); - auto data = out->data(); - for (int64_t i = 0; i < size; ++i) { + auto numel = out->numel(); + for (int64_t i = 0; i < numel; ++i) { tmp_data[i] = dist(*engine); } @@ -57,18 +58,18 @@ void RandintRawKernel(const Context& ctx, data, tmp.place(), tmp_data, - size * paddle::experimental::SizeOf(out->dtype()), + numel * paddle::experimental::SizeOf(out->dtype()), 0); } template -void RandintKernel(const Context& ctx, +void RandintKernel(const Context& dev_ctx, int low, int high, const ScalarArray& shape, DataType dtype, DenseTensor* out) { - RandintRawKernel(ctx, low, high, shape, dtype, 0, out); + RandintRawKernel(dev_ctx, low, high, shape, dtype, 0, out); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/randperm_kernel.cu b/paddle/phi/kernels/gpu/randperm_kernel.cu index f75f768b633a3..d4d90cac917a2 100644 --- a/paddle/phi/kernels/gpu/randperm_kernel.cu +++ b/paddle/phi/kernels/gpu/randperm_kernel.cu @@ -12,41 +12,60 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/kernels/randperm_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + // See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/core/kernel_registry.h" namespace phi { template -void RandpermKernel(const Context& ctx, - int n, - DataType dtype, - DenseTensor* out) { +void RandpermRawKernel( + const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out) { DenseTensor tmp; tmp.Resize(phi::make_ddim({n})); - T* tmp_data = ctx.template HostAlloc(&tmp); + T* tmp_data = dev_ctx.template HostAlloc(&tmp); - auto gen_ptr = ctx.GetHostGenerator(); - auto engine = gen_ptr->GetCPUEngine(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + } for (int i = 0; i < n; ++i) { tmp_data[i] = static_cast(i); } std::shuffle(tmp_data, tmp_data + n, *engine); - T* out_data = ctx.template Alloc(out); + T* out_data = dev_ctx.template Alloc(out); auto size = out->numel() * paddle::experimental::SizeOf(out->dtype()); paddle::memory::Copy( out->place(), out_data, tmp.place(), tmp_data, size, 0); } +template +void RandpermKernel(const Context& dev_ctx, + int n, + DataType dtype, + DenseTensor* out) { + RandpermRawKernel(dev_ctx, n, dtype, 0, out); +} + } // namespace phi +PD_REGISTER_KERNEL(randperm_raw, + GPU, + ALL_LAYOUT, + phi::RandpermRawKernel, + float, + double, + int, + int64_t) {} + PD_REGISTER_KERNEL(randperm, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/gpu/unbind_kernel.cu b/paddle/phi/kernels/gpu/unbind_kernel.cu index 1efc3a1094da2..8a7aa8f6033ab 100644 --- a/paddle/phi/kernels/gpu/unbind_kernel.cu +++ b/paddle/phi/kernels/gpu/unbind_kernel.cu @@ -12,9 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/unbind_kernel.h" + #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/unbind_kernel_impl.h" -#include "paddle/phi/kernels/unbind_kernel.h" PD_REGISTER_KERNEL(unbind, GPU, diff --git a/paddle/phi/kernels/impl/unbind_kernel_impl.h b/paddle/phi/kernels/impl/unbind_kernel_impl.h index 8a1342559bd90..3e233a2038e48 100644 --- a/paddle/phi/kernels/impl/unbind_kernel_impl.h +++ b/paddle/phi/kernels/impl/unbind_kernel_impl.h @@ -20,7 +20,7 @@ namespace phi { template -void UnbindKernel(const Context& ctx, +void UnbindKernel(const Context& dev_ctx, const DenseTensor& x, int axis, std::vector outs) { @@ -29,12 +29,12 @@ void UnbindKernel(const Context& ctx, std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { - ctx.template Alloc(outs[j]); + dev_ctx.template Alloc(outs[j]); shape_refer.emplace_back(outs[j]); } phi::funcs::SplitFunctor functor; - functor(ctx, x, shape_refer, axis, &outs); + functor(dev_ctx, x, shape_refer, axis, &outs); } } // namespace phi diff --git a/paddle/phi/kernels/randint_kernel.h b/paddle/phi/kernels/randint_kernel.h index 1a78e73d863e3..bfefc628614fb 100644 --- a/paddle/phi/kernels/randint_kernel.h +++ b/paddle/phi/kernels/randint_kernel.h @@ -20,7 +20,7 @@ namespace phi { template -void RandintKernel(const Context& ctx, +void RandintKernel(const Context& dev_ctx, int low, int high, const ScalarArray& shape, @@ -28,7 +28,7 @@ void RandintKernel(const Context& ctx, DenseTensor* out); template -void RandintRawKernel(const Context& ctx, +void RandintRawKernel(const Context& dev_ctx, int low, int high, const ScalarArray& shape, diff --git a/paddle/phi/kernels/randperm_kernel.h b/paddle/phi/kernels/randperm_kernel.h index 63bdac6da6fdc..70b95db98bef9 100644 --- a/paddle/phi/kernels/randperm_kernel.h 
+++ b/paddle/phi/kernels/randperm_kernel.h @@ -20,7 +20,11 @@ namespace phi { template -void RandpermKernel(const Context& ctx, +void RandpermRawKernel( + const Context& dev_ctx, int n, DataType dtype, int seed, DenseTensor* out); + +template +void RandpermKernel(const Context& dev_ctx, int n, DataType dtype, DenseTensor* out); diff --git a/paddle/phi/ops/compat/randperm_sig.cc b/paddle/phi/ops/compat/randperm_sig.cc index 14b28512e402a..89548beff6762 100644 --- a/paddle/phi/ops/compat/randperm_sig.cc +++ b/paddle/phi/ops/compat/randperm_sig.cc @@ -17,7 +17,12 @@ namespace phi { KernelSignature RandpermOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); + int seed = paddle::any_cast(ctx.Attr("seed")); + if (seed) { + return KernelSignature("randperm", {}, {"n", "dtype", "seed"}, {"Out"}); + } else { + return KernelSignature("randperm", {}, {"n", "dtype"}, {"Out"}); + } } } // namespace phi From 2553af4f41ca27ae33acb137755b2eb1c0686bc6 Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Fri, 25 Feb 2022 11:08:41 +0800 Subject: [PATCH 50/85] [Phi] mv kernel (#39861) [Phi] mv kernel --- paddle/fluid/operators/mv_op.cc | 15 ++-- paddle/fluid/operators/mv_op.cu | 94 -------------------- paddle/fluid/operators/mv_op.h | 105 ----------------------- paddle/phi/kernels/cpu/mv_grad_kernel.cc | 72 ++++++++++++++++ paddle/phi/kernels/cpu/mv_kernel.cc | 22 +++++ paddle/phi/kernels/gpu/mv_grad_kernel.cu | 83 ++++++++++++++++++ paddle/phi/kernels/gpu/mv_kernel.cu | 22 +++++ paddle/phi/kernels/impl/mv_kernel_impl.h | 45 ++++++++++ paddle/phi/kernels/mv_grad_kernel.h | 35 ++++++++ paddle/phi/kernels/mv_kernel.h | 27 ++++++ paddle/phi/ops/compat/mv_sig.cc | 33 +++++++ 11 files changed, 346 insertions(+), 207 deletions(-) delete mode 100644 paddle/fluid/operators/mv_op.cu delete mode 100644 paddle/fluid/operators/mv_op.h create mode 100644 paddle/phi/kernels/cpu/mv_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/mv_kernel.cc create mode 100644 paddle/phi/kernels/gpu/mv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/mv_kernel.cu create mode 100644 paddle/phi/kernels/impl/mv_kernel_impl.h create mode 100644 paddle/phi/kernels/mv_grad_kernel.h create mode 100644 paddle/phi/kernels/mv_kernel.h create mode 100644 paddle/phi/ops/compat/mv_sig.cc diff --git a/paddle/fluid/operators/mv_op.cc b/paddle/fluid/operators/mv_op.cc index 01135bab6d1d2..ab9f10070fc60 100644 --- a/paddle/fluid/operators/mv_op.cc +++ b/paddle/fluid/operators/mv_op.cc @@ -12,7 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/mv_op.h" +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + namespace paddle { namespace operators { @@ -116,10 +122,3 @@ REGISTER_OPERATOR(mv, ops::MVOp, ops::MVOpMaker, ops::MVOpGradMaker, ops::MVOpGradMaker); REGISTER_OPERATOR(mv_grad, ops::MVOpGrad); - -REGISTER_OP_CPU_KERNEL( - mv, ops::MVKernel, - ops::MVKernel); -REGISTER_OP_CPU_KERNEL( - mv_grad, ops::MVGradKernel, - ops::MVGradKernel); diff --git a/paddle/fluid/operators/mv_op.cu b/paddle/fluid/operators/mv_op.cu deleted file mode 100644 index b8b61ae490472..0000000000000 --- a/paddle/fluid/operators/mv_op.cu +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/mv_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace paddle { -namespace operators { - -template -__global__ void MVGradDxCUDAKernel(const int m, const int n, const T *dout, - const T *vec, T *dx) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < m * n; idx += blockDim.x * gridDim.x) { - int i = idx / n; - int j = idx % n; - dx[idx] = dout[i] * vec[j]; - } -} - -// Using dimensional constraints on matrix multiplication, it is -// straight-forward to check the following table for when X and Y -// are both matrices. -// -// dX = | dOut Vec^T -// dVec = | X^T dOut -template -class MVGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - auto *dout = - context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dvec = - context.Output(framework::GradVarName("Vec")); - - auto dim_x = x->dims(); - int m = dim_x[0]; - int n = dim_x[1]; - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - const T *dout_data = dout->data(); - - auto &dev_ctx = - context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - auto stream = context.cuda_device_context().stream(); - auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); - - if (dx) { - T *dx_data = dx->mutable_data(context.GetPlace()); - - MVGradDxCUDAKernel< - T><<>>( - m, n, dout_data, vec_data, dx_data); - } - - if (dvec) { - T *dvec_data = dvec->mutable_data(context.GetPlace()); - - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - mv, ops::MVKernel, - ops::MVKernel); -REGISTER_OP_CUDA_KERNEL( - mv_grad, ops::MVGradKernel, - ops::MVGradKernel); diff --git a/paddle/fluid/operators/mv_op.h b/paddle/fluid/operators/mv_op.h deleted file mode 100644 index c0a2172af3677..0000000000000 --- a/paddle/fluid/operators/mv_op.h +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" -#ifdef PADDLE_WITH_MKLDNN -#include "paddle/fluid/platform/mkldnn_helper.h" -#endif - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class MVKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - - auto *out = context.Output("Out"); - - auto dim_x = x->dims(); - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - T *out_data = out->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.GEMV(false, dim_x[0], dim_x[1], static_cast(1), x_data, vec_data, - static_cast(0), out_data); - } -}; - -// Using dimensional constraints on matrix multiplication, it is -// straight-forward to check the following table for when X and Y -// are both matrices. -// -// dX = | dOut vec^T -// dVec = | X^T dOut -template -class MVGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *x = context.Input("X"); - auto *vec = context.Input("Vec"); - auto *dout = - context.Input(framework::GradVarName("Out")); - auto *dx = context.Output(framework::GradVarName("X")); - auto *dvec = - context.Output(framework::GradVarName("Vec")); - - auto dim_x = x->dims(); - int m = dim_x[0]; - int n = dim_x[1]; - - // get data ptr - const T *x_data = x->data(); - const T *vec_data = vec->data(); - const T *dout_data = dout->data(); - - if (dx) { - T *dx_data = dx->mutable_data(context.GetPlace()); - - for (int i = 0; i < m; ++i) { - for (int j = 0; j < n; ++j) { - dx_data[i * n + j] = dout_data[i] * vec_data[j]; - } - } - } - - if (dvec) { - T *dvec_data = dvec->mutable_data(context.GetPlace()); - - auto &dev_ctx = context.template device_context(); - auto blas = phi::funcs::GetBlas(dev_ctx); - - blas.GEMV(true, dim_x[0], dim_x[1], static_cast(1), x_data, dout_data, - static_cast(0), dvec_data); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/kernels/cpu/mv_grad_kernel.cc b/paddle/phi/kernels/cpu/mv_grad_kernel.cc new file mode 100644 index 0000000000000..c3b7f94be4194 --- /dev/null +++ b/paddle/phi/kernels/cpu/mv_grad_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/mv_grad_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +void MvGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& vec, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* vec_grad) { + auto dout = out_grad; + auto dx = x_grad; + auto dvec = vec_grad; + + auto dim_x = x.dims(); + int m = dim_x[0]; + int n = dim_x[1]; + + // get data ptr + const T* x_data = x.data(); + const T* vec_data = vec.data(); + const T* dout_data = dout.data(); + + if (dx) { + T* dx_data = dev_ctx.template Alloc(dx); + + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + dx_data[i * n + j] = dout_data[i] * vec_data[j]; + } + } + } + + if (dvec) { + T* dvec_data = dev_ctx.template Alloc(dvec); + + auto blas = phi::funcs::GetBlas(dev_ctx); + + blas.GEMV(true, + dim_x[0], + dim_x[1], + static_cast(1), + x_data, + dout_data, + static_cast(0), + dvec_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mv_grad, CPU, ALL_LAYOUT, phi::MvGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/cpu/mv_kernel.cc b/paddle/phi/kernels/cpu/mv_kernel.cc new file mode 100644 index 0000000000000..7f76ddda6dde5 --- /dev/null +++ b/paddle/phi/kernels/cpu/mv_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mv_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/impl/mv_kernel_impl.h" + +PD_REGISTER_KERNEL(mv, CPU, ALL_LAYOUT, phi::MvKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/mv_grad_kernel.cu b/paddle/phi/kernels/gpu/mv_grad_kernel.cu new file mode 100644 index 0000000000000..9eb8cd375ebd6 --- /dev/null +++ b/paddle/phi/kernels/gpu/mv_grad_kernel.cu @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/mv_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +__global__ void MVGradDxCUDAKernel( + const int m, const int n, const T *dout, const T *vec, T *dx) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < m * n; idx += blockDim.x * gridDim.x) { + int i = idx / n; + int j = idx % n; + dx[idx] = dout[i] * vec[j]; + } +} + +template +void MvGradKernel(const Context &dev_ctx, + const DenseTensor &x, + const DenseTensor &vec, + const DenseTensor &out_grad, + DenseTensor *x_grad, + DenseTensor *vec_grad) { + auto dout = out_grad; + auto dx = x_grad; + auto dvec = vec_grad; + + auto dim_x = x.dims(); + int m = dim_x[0]; + int n = dim_x[1]; + + // get data ptr + const T *x_data = x.data(); + const T *vec_data = vec.data(); + const T *dout_data = dout.data(); + + auto blas = phi::funcs::GetBlas(dev_ctx); + auto stream = dev_ctx.stream(); + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, m * n); + + if (dx) { + T *dx_data = dev_ctx.template Alloc(dx); + + MVGradDxCUDAKernel< + T><<>>( + m, n, dout_data, vec_data, dx_data); + } + + if (dvec) { + T *dvec_data = dev_ctx.template Alloc(dvec); + + blas.GEMV(true, + dim_x[0], + dim_x[1], + static_cast(1), + x_data, + dout_data, + static_cast(0), + dvec_data); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(mv_grad, GPU, ALL_LAYOUT, phi::MvGradKernel, float, double) { +} diff --git a/paddle/phi/kernels/gpu/mv_kernel.cu b/paddle/phi/kernels/gpu/mv_kernel.cu new file mode 100644 index 0000000000000..1faba5a62d2cd --- /dev/null +++ b/paddle/phi/kernels/gpu/mv_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/mv_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/kernels/impl/mv_kernel_impl.h" + +PD_REGISTER_KERNEL(mv, GPU, ALL_LAYOUT, phi::MvKernel, float, double) {} diff --git a/paddle/phi/kernels/impl/mv_kernel_impl.h b/paddle/phi/kernels/impl/mv_kernel_impl.h new file mode 100644 index 0000000000000..1754ea323ceb9 --- /dev/null +++ b/paddle/phi/kernels/impl/mv_kernel_impl.h @@ -0,0 +1,45 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/kernels/funcs/blas/blas.h" + +namespace phi { + +template +void MvKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& vec, + DenseTensor* out) { + auto dim_x = x.dims(); + + // get data ptr + const T* x_data = x.data(); + const T* vec_data = vec.data(); + T* out_data = dev_ctx.template Alloc(out); + + auto blas = phi::funcs::GetBlas(dev_ctx); + + blas.GEMV(false, + dim_x[0], + dim_x[1], + static_cast(1), + x_data, + vec_data, + static_cast(0), + out_data); +} + +} // namespace phi diff --git a/paddle/phi/kernels/mv_grad_kernel.h b/paddle/phi/kernels/mv_grad_kernel.h new file mode 100644 index 0000000000000..edc73d89367ff --- /dev/null +++ b/paddle/phi/kernels/mv_grad_kernel.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +// Using dimensional constraints on matrix multiplication, it is +// straight-forward to check the following table for when X and Y +// are both matrices. +// +// dX = | dOut vec^T +// dVec = | X^T dOut +template +void MvGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& vec, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* vec_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/mv_kernel.h b/paddle/phi/kernels/mv_kernel.h new file mode 100644 index 0000000000000..ab4f0b82794ab --- /dev/null +++ b/paddle/phi/kernels/mv_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void MvKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& vec, + DenseTensor* out); + +} // namepsace phi diff --git a/paddle/phi/ops/compat/mv_sig.cc b/paddle/phi/ops/compat/mv_sig.cc new file mode 100644 index 0000000000000..ab0d31ee31dab --- /dev/null +++ b/paddle/phi/ops/compat/mv_sig.cc @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature MvOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mv", {"X", "Vec"}, {}, {"Out"}); +} + +KernelSignature MvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("mv_grad", + {"X", "Vec", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Vec")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(mv, phi::MvOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(mv_grad, phi::MvGradOpArgumentMapping); From 2fedd39bcf8fc0d74d693e299d1c11019300fbe7 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Fri, 25 Feb 2022 11:50:18 +0800 Subject: [PATCH 51/85] [bf16] add bf16 kernel: elementwise_add elementwise_mul elementwise_sub (#39716) * add ele_add * add ele_mul * add ele_sub * sovle conflict * fix npu * refine ele_add * add ele_mul unittest * refine ele_sub * refine ci * refine unittest --- .../elementwise/elementwise_add_op.cu | 1 + .../elementwise/elementwise_mul_op.cc | 8 ++++ .../elementwise/elementwise_mul_op.cu | 6 +++ .../elementwise/elementwise_sub_op.cc | 6 +++ .../elementwise/elementwise_sub_op.cu | 6 +++ .../kernels/cpu/elementwise_grad_kernel.cc | 2 + paddle/phi/kernels/cpu/math_kernel.cc | 6 ++- paddle/phi/kernels/funcs/blas/blas_impl.h | 30 +++++++++++++ .../kernels/gpu/elementwise_grad_kernel.cu | 5 +++ paddle/phi/kernels/gpu/math_kernel.cu | 5 ++- paddle/phi/kernels/math_kernel.cc | 7 +++- .../unittests/test_elementwise_add_op.py | 42 ++++++++++++++++++- .../unittests/test_elementwise_mul_op.py | 35 +++++++++++++++- .../unittests/test_elementwise_sub_op.py | 30 ++++++++++++- .../test_imperative_auto_mixed_precision.py | 2 +- 15 files changed, 182 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 2b55d9fbaf6cb..52bf9b0e03f02 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -24,5 +24,6 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, ops::ElementwiseAddKernel>, ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc index 5ff0f29ab43a0..e172279145e28 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cc @@ -167,6 +167,8 @@ REGISTER_OP_CPU_KERNEL( ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, 
ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, + ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulKernel, ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, ops::ElementwiseMulKernel>, ops::ElementwiseMulKernel>); REGISTER_OP_CUDA_KERNEL( @@ -110,6 +111,7 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, ops::ElementwiseMulGradKernel>, ops::ElementwiseMulGradKernel, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel, + ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulDoubleGradKernel>, ops::ElementwiseMulDoubleGradKernel, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel, + ops::ElementwiseMulTripleGradKernel, ops::ElementwiseMulTripleGradKernel>, ops::ElementwiseMulTripleGradKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel>, ops::ElementwiseSubKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel>, ops::ElementwiseSubGradKernel, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, ops::ElementwiseSubKernel, @@ -34,6 +36,8 @@ REGISTER_OP_CUDA_KERNEL( ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, ops::ElementwiseSubGradKernel, @@ -51,6 +55,8 @@ REGISTER_OP_CUDA_KERNEL( int>, ops::ElementwiseSubDoubleGradKernel, + ops::ElementwiseSubDoubleGradKernel, ops::ElementwiseSubDoubleGradKernel>, ops::ElementwiseSubDoubleGradKernel, phi::dtype::complex) {} @@ -182,5 +183,6 @@ PD_REGISTER_KERNEL(subtract_double_grad, int16_t, int, int64_t, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/math_kernel.cc b/paddle/phi/kernels/cpu/math_kernel.cc index 581c5f90f35e5..5cfcfe62c7816 100644 --- a/paddle/phi/kernels/cpu/math_kernel.cc +++ b/paddle/phi/kernels/cpu/math_kernel.cc @@ -139,7 +139,8 @@ PD_REGISTER_KERNEL(subtract_raw, int, int64_t, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(divide_raw, CPU, ALL_LAYOUT, @@ -160,7 +161,8 @@ PD_REGISTER_KERNEL(multiply_raw, int64_t, bool, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(sum_raw, CPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index 4d7700a89d27b..2868aa5acb75e 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -76,6 +76,36 @@ struct CBlas { "Blas VCOPY do not supported on CPU with bfloat16," " please check your code")); } + + template + static void VADD(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } + } + + template + static void VMUL(int n, + const phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } + } + + template + static void VSUB(int n, + const 
phi::dtype::bfloat16 *x, + const phi::dtype::bfloat16 *y, + phi::dtype::bfloat16 *z) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] - y[i]; + } + } }; #ifdef PADDLE_WITH_MKLML diff --git a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu index 02dbb506c4eb5..3c4c01b1dc8ff 100644 --- a/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/elementwise_grad_kernel.cu @@ -128,6 +128,7 @@ PD_REGISTER_KERNEL(add_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -140,6 +141,7 @@ PD_REGISTER_KERNEL(add_double_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -152,6 +154,7 @@ PD_REGISTER_KERNEL(add_triple_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -164,6 +167,7 @@ PD_REGISTER_KERNEL(subtract_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} @@ -176,5 +180,6 @@ PD_REGISTER_KERNEL(subtract_double_grad, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 02e3f00bd3425..56e8b16ccbe0d 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -106,6 +106,7 @@ PD_REGISTER_KERNEL(add_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(subtract_raw, @@ -118,6 +119,7 @@ PD_REGISTER_KERNEL(subtract_raw, int, int64_t, float16, + bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(divide_raw, @@ -143,7 +145,8 @@ PD_REGISTER_KERNEL(multiply_raw, bool, float16, complex64, - complex128) {} + complex128, + bfloat16) {} PD_REGISTER_KERNEL(sum_raw, GPU, ALL_LAYOUT, diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index db6c5e1ac3591..3cb7b66ddf73e 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -121,7 +121,8 @@ PD_REGISTER_KERNEL(subtract, int, int64_t, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(divide, CPU, ALL_LAYOUT, @@ -142,7 +143,8 @@ PD_REGISTER_KERNEL(multiply, int64_t, bool, complex64, - complex128) {} + complex128, + phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(mean, @@ -180,6 +182,7 @@ PD_REGISTER_KERNEL(add, int, int64_t, phi::dtype::float16, + phi::dtype::bfloat16, complex64, complex128) {} PD_REGISTER_KERNEL(subtract, diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py index d067a2bd57788..d1d391a3949ea 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py @@ -17,7 +17,7 @@ import numpy as np import paddle import paddle.fluid.core as core -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard @@ -98,6 +98,46 @@ def test_check_output(self): place, atol=1e-3, check_dygraph=(self.use_mkldnn == False)) +@unittest.skipIf( + not core.is_compiled_with_cuda() or core.cudnn_version() < 8100, + "core is not compiled with CUDA and cudnn version need larger than 
8.1.0") +class TestBF16ElementwiseAddOp(OpTest): + def setUp(self): + self.op_type = "elementwise_add" + self.dtype = np.uint16 + + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.out = np.add(self.x, self.y) + + self.axis = -1 + + self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)), + 'Y': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y)) + } + self.attrs = {'axis': self.axis, 'use_mkldnn': False} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + place = core.CUDAPlace(0) + self.check_grad_with_place(place, ['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseAddOp_scalar(TestElementwiseAddOp): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py index 7bace9bc53524..00967cb503fe5 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py @@ -23,7 +23,7 @@ from paddle.fluid import Program, compiler, program_guard from paddle.fluid.op import Operator -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class ElementwiseMulOp(OpTest): @@ -83,6 +83,39 @@ def init_axis(self): pass +class TestBF16ElementwiseMulOp(OpTest): + def setUp(self): + self.op_type = "elementwise_mul" + self.dtype = np.uint16 + + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + self.out = np.multiply(self.x, self.y) + + self.axis = -1 + + self.inputs = { + 'X': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.x)), + 'Y': + OpTest.np_dtype_to_fluid_dtype(convert_float_to_uint16(self.y)) + } + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.attrs = {'axis': self.axis, 'use_mkldnn': False} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseMulOp_scalar(ElementwiseMulOp): diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py index 2594c96eebd69..6801a4bc5f30b 100644 --- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py +++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py @@ -17,7 +17,8 @@ import numpy as np import paddle import paddle.fluid as fluid -from op_test import OpTest, skip_check_grad_ci +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 class TestElementwiseOp(OpTest): @@ -44,6 +45,33 @@ def 
test_check_grad_ingore_y(self): ['X'], 'Out', max_relative_error=0.005, no_grad_set=set('Y')) +class TestBF16ElementwiseOp(OpTest): + def setUp(self): + self.op_type = "elementwise_sub" + self.dtype = np.uint16 + x = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + y = np.random.uniform(0.1, 1, [13, 17]).astype(np.float32) + out = x - y + + self.inputs = { + 'X': convert_float_to_uint16(x), + 'Y': convert_float_to_uint16(y) + } + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + @skip_check_grad_ci( reason="[skip shape check] Use y_shape(1) to test broadcast.") class TestElementwiseSubOp_scalar(TestElementwiseOp): diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 306c6b4707e8a..0043a7f78b4b3 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -1143,7 +1143,7 @@ def train(self, enable_amp=True): def test_bf16(self): out_fp32 = self.train(enable_amp=False) out_bf16 = self.train(enable_amp=True) - self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-2)) + self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-1)) if __name__ == '__main__': From bbe5228ca812b27b4c1514d928d894b61fd5a543 Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Fri, 25 Feb 2022 12:27:57 +0800 Subject: [PATCH 52/85] Optimize perf of softmax_with_cross_entropy (#39553) * Optimize perf of softmax_with_cross_entropy * fix * fix * fix accuracy error --- .../softmax_with_cross_entropy_op.cu | 296 +++++++++++++++++- 1 file changed, 289 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index fd035df768dbd..92e2adb3ee8d2 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -27,6 +27,8 @@ namespace cub = hipcub; namespace paddle { namespace operators { +#define ALIGN_BYTES 16 + using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; @@ -47,6 +49,18 @@ static __device__ __forceinline__ T Exp(T x) { return math::TolerableValue()(static_cast(expx)); } +template +struct ExpAddFunctor { + HOSTDEVICE inline ExpAddFunctor(Tx max) : max(max) {} + + HOSTDEVICE inline Ty operator()(const Tx& sum, const Tx& x) const { + return static_cast(sum + std::exp(x - max)); + } + + private: + Tx max; +}; + // log2(value) static inline int Log2Ceil(int value) { int log2_value = 0; @@ -419,10 +433,272 @@ void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, } } +template +__device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, + const int label_id, + const int64_t label_value, + const int tid, const int vec_size, + const int offset, + const int ignore_index) { + int loss_id = vec_size * tid + offset; + if (IgnoreIndex) { + if (label_value == loss_id) { + if (label_value == ignore_index) { + loss[label_id] = 
static_cast(0.0f); + } else { + loss[label_id] = loss_value; + } + } + } else { + if (label_value == loss_id) { + loss[label_id] = loss_value; + } + } +} + +template +__device__ __forceinline__ AccT ThreadReduce(const T* input, int size, + const int offset, AccT init, + ReduceFunctor reducer) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + AccT val = init; + + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + val = reducer(val, input[tid]); + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + val = reducer(val, ins[i]); + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + val = reducer(val, input[tid]); + } + return val; +} + +template +__device__ __forceinline__ void VectorizedSoftmaxForwardImpl( + T* loss, T* softmax, const T* logits, const LabelT* label, int size, + const int offset, const LogSoftmaxForwardFunctor& func, + const int ignore_index) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + int label_id = blockIdx.x; + auto label_value = static_cast(label[label_id]); + const bool label_valid = label_value >= 0 && label_value < size; + int loss_id_offset = 0; + + if (offset > 0) { + logits -= offset; + softmax -= offset; + size += offset; + loss_id_offset -= offset; + if (tid >= offset) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, 1, + loss_id_offset, ignore_index); + } + } + size -= blockDim.x; + logits += blockDim.x; + softmax += blockDim.x; + loss_id_offset += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + T outs[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + VecT* outs_vec = reinterpret_cast(&outs); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + // read + *ins_vec = reinterpret_cast(logits)[tid]; + +#pragma unroll + // compute + for (int i = 0; i < VecSize; ++i) { + AccT log_softmax = func(static_cast(ins[i])); + outs[i] = static_cast(std::exp(log_softmax)); + + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, VecSize, + loss_id_offset + i, ignore_index); + } + } + + // write + reinterpret_cast(softmax)[tid] = *outs_vec; + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), label_id, + label_value, tid, 1, loss_id_offset, + ignore_index); + } + } + + // invalid label, write once + if (!label_valid && threadIdx.x == 0) { + loss[label_id] = static_cast(0.0f); + } +} + +template +__device__ __forceinline__ void ScalarSoftmaxForwardImpl( + T* loss, T* softmax, const T* logits, const LabelT* label, const int size, + const LogSoftmaxForwardFunctor& func, const int ignore_index) { + int tid = threadIdx.x; + int remain = size % (VecSize * blockDim.x); + int label_id = blockIdx.x; + auto label_value = static_cast(label[label_id]); + const bool 
label_valid = label_value >= 0 && label_value < size; + + // main part + for (; tid < (size - remain); tid += VecSize * blockDim.x) { + T ins[VecSize]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + ins[i] = logits[tid + i * blockDim.x]; + } +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT log_softmax = func(static_cast(ins[i])); + softmax[tid + i * blockDim.x] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), + label_id, label_value, tid, VecSize, i, + ignore_index); + } + } + } + + // tail part + for (; tid < size; tid += blockDim.x) { + AccT log_softmax = func(static_cast(logits[tid])); + softmax[tid] = static_cast(std::exp(log_softmax)); + // loss + if (label_valid) { + ComputeLoss(loss, static_cast(-log_softmax), label_id, + label_value, tid, 1, 0, ignore_index); + } + } + + // invalid label, write once + if (!label_valid && threadIdx.x == 0) { + loss[label_id] = static_cast(0.0f); + } +} + +template +__global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, + const LabelT* label, + const int high_dim, const int mid_dim, + const int ignore_index) { + using VecT = kps::details::VectorType; + + // each block deal with one batch + logits += blockIdx.x * mid_dim; + softmax += blockIdx.x * mid_dim; + + const int input_offset = ((uint64_t)logits) % ALIGN_BYTES / sizeof(T); + const int output_offset = ((uint64_t)softmax) % ALIGN_BYTES / sizeof(T); + + // 1. reduce max + AccT max = ThreadReduce>( + logits, mid_dim, input_offset, -std::numeric_limits::infinity(), + kps::MaxFunctor()); + max = kps::details::BlockXReduce>( + max, kps::MaxFunctor()); + + // 2. reduce sum + AccT sum = ThreadReduce>( + logits, mid_dim, input_offset, static_cast(0), + ExpAddFunctor(max)); + sum = kps::details::BlockXReduce>( + sum, kps::AddFunctor()); + + // 3. softmax + LogSoftmaxForwardFunctor func(max, sum); + if (input_offset == output_offset) { + VectorizedSoftmaxForwardImpl( + loss, softmax, logits, label, mid_dim, input_offset, func, + ignore_index); + } else { + ScalarSoftmaxForwardImpl( + loss, softmax, logits, label, mid_dim, func, ignore_index); + } +} + +template +void LaunchVectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, + const LabelT* label, const int high_dim, + const int mid_dim, const int ignore_index, + gpuStream_t stream) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + const int max_num_threads = 1024; + int max_block_size = std::min(mid_dim / vec_size, max_num_threads); + if (vec_size > 1) { + max_block_size /= 2; + } + + int block_size = 1; + while (block_size < max_block_size) { + block_size *= 2; + } + block_size = std::max(block_size, kps::details::kWarpSize); + dim3 grids(high_dim); + dim3 blocks(block_size); + VectorizedSoftmaxForward<<>>( + loss, softmax, logits, label, high_dim, mid_dim, ignore_index); +} + /* Wrapper of softmax with cross entropy hard label. 
- - SwitchWarpSoftmaxForward for small size - - cudnn function for large size + - SwitchWarpSoftmaxForward for small size when axis == -1 + - LaunchVectorizedSoftmaxForward for large size when axis == -1 + - cudnn function for axis != -1 */ template static void SoftmaxWithCrossEntropyHardLabel( @@ -431,11 +707,17 @@ static void SoftmaxWithCrossEntropyHardLabel( T* softmax_data, int N, int dim, int D, const int ignore_index) { auto stream = ctx.stream(); constexpr int max_dim = 320; - if (D == 1 && dim <= max_dim) { // small size - const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward( - loss_data, softmax_data, logits_data, labels_data, N, dim, dim, - ignore_index, stream); + if (D == 1) { + if (dim <= max_dim) { // small size + const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; + SwitchWarpSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, dim, + ignore_index, stream); + } else { // large size + LaunchVectorizedSoftmaxForward( + loss_data, softmax_data, logits_data, labels_data, N, dim, + ignore_index, stream); + } } else { ScopedTensorDescriptor desc; std::vector tensor_dims = {N, dim, D, 1}; From 64f1485a79cb0b9b5a19fddebade0f6e0bf0bb3b Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Fri, 25 Feb 2022 13:03:07 +0800 Subject: [PATCH 53/85] replace implementation with cuda kernel (#39795) --- paddle/fluid/operators/dropout_impl.cu.h | 37 ++++++++++++++---------- paddle/phi/kernels/funcs/functors.h | 9 ++++-- 2 files changed, 28 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 21fdf69ac570a..2fa956a2e6515 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/kernels/funcs/functors.h" namespace paddle { namespace operators { @@ -270,32 +271,38 @@ void DropoutGradGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, const Tensor& mask, int64_t size, Tensor* grad_x, bool is_test = false) { using MT = typename details::MPTypeTrait::Type; - auto dX = EigenVector::Flatten(*grad_x); - auto dY = EigenVector::Flatten(grad_y); - - auto& place = *dev_ctx.eigen_device(); + auto stream = dev_ctx.stream(); + MT factor; if (is_test) { if (dropout_implementation == "upscale_in_train") { - dX.device(place) = static_cast(1) * dY; + factor = static_cast(1.0f); } else { - dX.device(place) = dY * static_cast(1.0f - dropout_prob); + factor = static_cast(1.0f - dropout_prob); } + std::vector ins = {&grad_y}; + std::vector outs = {grad_x}; + auto functor = phi::funcs::ScaleFunctor(factor); + paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else { - auto M = EigenVector::Flatten(mask); + std::vector ins = {&grad_y, &mask}; + std::vector outs = {grad_x}; if (dropout_implementation == "upscale_in_train") { if (dropout_prob == 1.0f) { - dX.device(place) = static_cast(0) * dY; +#ifdef PADDLE_WITH_HIP + hipMemset(grad_x->data(), 0, size * sizeof(T)); +#else + cudaMemset(grad_x->data(), 0, size * sizeof(T)); +#endif } else { - auto factor = static_cast(1.0f / (1.0f - dropout_prob)); - auto stream = dev_ctx.stream(); - std::vector ins = {&grad_y, &mask}; - std::vector outs = {grad_x}; - auto functor = CudaDropoutGradFunctor(factor); + factor = static_cast(1.0f / (1.0f - dropout_prob)); paddle::operators::LaunchSameDimsElementwiseCudaKernel( - dev_ctx, ins, &outs, functor); + dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } else { - dX.device(place) = dY * M.cast(); + factor = static_cast(1.0f); + paddle::operators::LaunchSameDimsElementwiseCudaKernel( + dev_ctx, ins, &outs, CudaDropoutGradFunctor(factor)); } } } diff --git a/paddle/phi/kernels/funcs/functors.h b/paddle/phi/kernels/funcs/functors.h index 5657bb047d7aa..d518a877b26f2 100644 --- a/paddle/phi/kernels/funcs/functors.h +++ b/paddle/phi/kernels/funcs/functors.h @@ -38,12 +38,15 @@ struct AddGradFunctor { template struct ScaleFunctor { - explicit ScaleFunctor(const T coeff) : coeff_(coeff) {} + using MT = typename paddle::operators::details::MPTypeTrait::Type; + explicit ScaleFunctor(const MT coeff) : coeff_(coeff) {} - inline HOSTDEVICE T operator()(T ele) { return ele * coeff_; } + inline HOSTDEVICE T operator()(T ele) { + return static_cast(static_cast(ele) * coeff_); + } private: - T coeff_; + MT coeff_; }; template From 2533cac66efadce1c28789d26c768c15c5b0cb1e Mon Sep 17 00:00:00 2001 From: zn <96479180+kangna-qi@users.noreply.github.com> Date: Fri, 25 Feb 2022 13:28:43 +0800 Subject: [PATCH 54/85] [MLU]support launch process on mlu (#39839) --- python/paddle/distributed/fleet/launch.py | 27 ++++++- .../paddle/distributed/fleet/launch_utils.py | 62 ++++++++++++++- .../fluid/tests/unittests/mlu/CMakeLists.txt | 19 ++++- .../tests/unittests/mlu/multi_process_mlu.py | 77 +++++++++++++++++++ .../tests/unittests/mlu/nproc_process_mlu.py | 38 +++++++++ .../unittests/mlu/test_launch_async_mlu.sh | 59 ++++++++++++++ .../unittests/mlu/test_launch_cloud_mlu.sh | 58 ++++++++++++++ .../unittests/mlu/test_launch_nproc_mlu.sh | 75 ++++++++++++++++++ 8 files 
changed, 407 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh create mode 100644 python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index 19306d3da9916..0d985a5232517 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -156,6 +156,16 @@ def _parse_args(): ) base_group.add_argument("--selected_npus", dest="npus") + if fluid.core.is_compiled_with_mlu(): + base_group.add_argument( + "--mlus", + type=str, + default=None, + help="It's for mlu training. For example: " + "--mlus=\"0,1,2,3\" will launch four training processes each bound to one mlu." + ) + base_group.add_argument("--selected_mlus", dest="mlus") + base_group.add_argument( "training_script", type=str, @@ -429,6 +439,8 @@ def infer_backend(args): args.backend = 'unknown' elif fluid.core.is_compiled_with_xpu(): args.backend = 'bkcl' + elif fluid.core.is_compiled_with_mlu(): + args.backend = 'cncl' else: args.backend = 'gloo' @@ -472,6 +484,8 @@ def which_distributed_mode(args): accelerators = fluid.core.get_npu_device_count() elif fluid.core.is_compiled_with_xpu(): accelerators = fluid.core.get_xpu_device_count() + elif fluid.core.is_compiled_with_mlu(): + accelerators = fluid.core.get_mlu_device_count() else: accelerators = 0 @@ -490,17 +504,18 @@ def which_distributed_mode(args): return DistributeMode.COLLECTIVE else: if not fluid.core.is_compiled_with_cuda( - ) and not fluid.core.is_compiled_with_xpu(): + ) and not fluid.core.is_compiled_with_xpu( + ) and not fluid.core.is_compiled_with_mlu(): if args.servers: logger.warning( - "Not found distinct arguments and not compiled with cuda or xpu or npu. " + "Not found distinct arguments and not compiled with cuda or xpu or npu or mlu. " "But found args.servers not empty, default use ps mode") return DistributeMode.PS else: return DistributeMode.COLLECTIVE else: logger.warning( - "Not found distinct arguments and compiled with cuda or xpu or npu. " + "Not found distinct arguments and compiled with cuda or xpu or npu or mlu. " "Default use collective mode") return DistributeMode.COLLECTIVE @@ -536,6 +551,10 @@ def launch(): - ``--selected_xpus``: xpus aliases, recommend to use ``--xpus``. + - ``--mlus``: It's for mlu training. e.g., ``--mlus=0,1,2,3`` will launch four training processes each bound to one mlu. + + - ``--selected_mlus``: mlus aliases, recommend to use ``--mlus``. + - ``training_script``: The full path to the single GPU training program/script to be launched in parallel, followed by all the arguments for the training script. e.g., ``traing.py`` - ``training_script_args``: The args of training_script. 
e.g., ``--lr=0.1`` @@ -688,7 +707,7 @@ def launch(): check_backend(args.backend) distribute_mode = DistributeMode.COLLECTIVE - #assert args.backend in ['gloo', 'nccl', 'bkcl', 'heter', 'unknown'] + #assert args.backend in ['gloo', 'nccl', 'bkcl', 'cncl', 'heter', 'unknown'] if args.backend == 'gloo': logger.warning("launch start with CPUONLY mode") diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index c20c209d60171..2dec58c753853 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -57,6 +57,7 @@ class DeviceMode(): XPU = 2 ASCEND_NPU = 3 UNKNOWN = 3 + MLU = 4 class Cluster(object): @@ -287,7 +288,7 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, ), "current trainer_endpoints size should be greater equal than acclerators size." for i in range(len(devices_per_proc)): trainer = Trainer() - if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU: + if device_mode == DeviceMode.GPU or device_mode == DeviceMode.ASCEND_NPU or device_mode == DeviceMode.MLU: if isinstance(devices_per_proc[i], (list, tuple)): trainer.accelerators.extend(devices_per_proc[i]) pod.accelerators.extend(devices_per_proc[i]) @@ -530,6 +531,9 @@ def start_local_trainers(cluster, accelerators) > 0 and pod.device_mode == DeviceMode.ASCEND_NPU: proc_env["FLAGS_selected_npus"] = "%s" % ",".join( [str(g) for g in t.accelerators]) + elif len(t.accelerators) > 0 and pod.device_mode == DeviceMode.MLU: + proc_env["FLAGS_selected_mlus"] = "%s" % ",".join( + [str(g) for g in t.accelerators]) if len(t.accelerators) > 0: proc_env["FLAGS_selected_accelerators"] = "%s" % ",".join( @@ -735,6 +739,35 @@ def get_npus(npus): return res_npus +def get_mlus(mlus): + if mlus is None: + mlus_num = fluid.core.get_mlu_device_count() + res_mlus = [str(x) for x in range(0, mlus_num)] + else: + mlu_visible_devices = os.getenv("MLU_VISIBLE_DEVICES") + if mlu_visible_devices is None or mlu_visible_devices == "": + res_mlus = [x.strip() for x in mlus.split(',')] + else: + # change mlus into relative values + # e.g. MLU_VISIBLE_DEVICES=4,5,6,7; args.mlus=4,5,6,7; + # therefore mlus=0,1,2,3 + mlu_visible_devices_list = mlu_visible_devices.split(',') + for x in mlus.split(','): + assert x in mlu_visible_devices_list, "Can't find "\ + "your mlus %s in MLU_VISIBLE_DEVICES[%s]."\ + % (x, mlu_visible_devices) + res_mlus = [ + mlu_visible_devices_list.index(x.strip()) + for x in mlus.split(',') + ] + logger.info("Change selected_mlus into reletive values. 
--ips:{} " + "will change into relative_ips:{} according to your " + "MLU_VISIBLE_DEVICES:{}".format( + mlus, res_mlus, mlu_visible_devices_list)) + + return res_mlus + + def get_device_mode(backend): if backend == 'heter': if fluid.core.is_compiled_with_cuda() and \ @@ -763,6 +796,10 @@ def get_device_mode(backend): print("launch train in XPU mode") return DeviceMode.XPU + if backend == 'cncl' and fluid.core.get_mlu_device_count() > 0: + print("launch train in MLU mode") + return DeviceMode.MLU + if backend == 'gloo': print("launch train in CPU mode") return DeviceMode.CPU @@ -812,6 +849,18 @@ def get_device_proc_info(args): ] else: devices_per_proc = xpus + elif device_mode == DeviceMode.MLU: + mlus = get_mlus(args.mlus) + if args.nproc_per_node is not None: + assert (len(mlus) % int(args.nproc_per_node)) ==0, \ + "mlus' number:{} mod args.nproc_per_node:{} must == 0".format(len(mlus), args.nproc_per_node) + + n = int(len(mlus) / int(args.nproc_per_node)) + devices_per_proc = [ + mlus[i:i + n] for i in six.moves.range(0, len(mlus), n) + ] + else: + devices_per_proc = mlus elif device_mode == DeviceMode.CPU: if hasattr(args, "paddle_cpuonly") and args.nproc_per_node is None: #NOTE (xiongkun03) set it to cpu core number @@ -1719,7 +1768,7 @@ def start_pod_heter_worker(self, args, pod): def check_backend(backend): - if backend not in ['nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter']: + if backend not in ['nccl', 'gloo', 'bkcl', 'cncl', 'auto', 'hccl', 'heter']: raise ValueError("paddle.distributed initialize error, " "backend argument can only be one of " "'nccl', 'gloo', 'bkcl', 'auto', 'hccl', 'heter' " @@ -1743,6 +1792,12 @@ def check_backend(backend): "your paddle is not compiled with npu but you assign 'hccl' as backend." ) + if backend == 'cncl' and not fluid.core.is_compiled_with_mlu(): + raise ValueError( + "paddle.distributed initialize error, " + "your paddle is not compiled with mlu but you assign 'cncl' as backend." 
+ ) + def block_windows_and_macos(backend): if backend != 'gloo': return @@ -1766,4 +1821,7 @@ def get_backend_by_compile_flag(): if fluid.core.is_compiled_with_npu(): return 'hccl' + if fluid.core.is_compiled_with_mlu(): + return 'cncl' + return 'gloo' diff --git a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt index 41f3a31017e7f..c17790bd3200e 100644 --- a/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mlu/CMakeLists.txt @@ -1,10 +1,25 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +file(GLOB TEST_DIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_collective_*.py") +string(REPLACE ".py" "" TEST_DIST_OPS "${TEST_DIST_OPS}") if (WITH_MLU) + foreach(TEST_OP ${TEST_DIST_OPS}) + LIST(REMOVE_ITEM TEST_OPS ${TEST_OP}) + endforeach(TEST_OP) + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) - set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + + if(WITH_CNCL) + foreach(TEST_OP ${TEST_DIST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP}) + endforeach(TEST_OP) + bash_test_modules(test_launch_async_mlu START_BASH test_launch_async_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_cloud_mlu START_BASH test_launch_cloud_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + bash_test_modules(test_launch_nproc_mlu START_BASH test_launch_nproc_mlu.sh ENVS PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR}) + set_tests_properties(test_collective_broadcast PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce PROPERTIES TIMEOUT 120) + endif(WITH_CNCL) endif() diff --git a/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py new file mode 100644 index 0000000000000..9ea550a8452e4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/multi_process_mlu.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
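
For reference, the get_mlus() helper added to launch_utils.py above maps the --mlus list onto indices relative to MLU_VISIBLE_DEVICES, mirroring the existing GPU/XPU helpers. A minimal standalone sketch of that remapping (illustrative only; the function name is not from the patch):

    import os

    def relative_mlu_ids(mlus_arg):
        """Sketch of get_mlus(): map requested physical MLU ids to relative indices."""
        visible = os.getenv("MLU_VISIBLE_DEVICES")
        if visible is None or visible == "":
            # No restriction set: the requested ids can be used as-is.
            return [x.strip() for x in mlus_arg.split(",")]
        visible_list = visible.split(",")
        # e.g. MLU_VISIBLE_DEVICES=4,5,6,7 and --mlus=4,5,6,7 gives [0, 1, 2, 3]
        return [visible_list.index(x.strip()) for x in mlus_arg.split(",")]
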
+ +import os +import sys +import time +import paddle.fluid as fluid + + +def train(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + + +def train_abort(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + if trainer_id == 0: + try: + # train abort + exit(1) + except SystemExit: + name = "abort>>> selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + print(name) + with open( + "multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + raise + else: + # sleep 30s to make sure paddle.distributed.launch will terminate this process + time.sleep(30) + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), + "w") as f: + f.write(name) + + +if __name__ == '__main__': + if len(sys.argv) == 3 and sys.argv[2] == "abort": + prefix = sys.argv[1] + train_abort(prefix) + else: + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py b/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py new file mode 100644 index 0000000000000..9b2713532e41b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/nproc_process_mlu.py @@ -0,0 +1,38 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
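
Both test helpers read back the per-trainer environment that start_local_trainers() exports. For the MLU branch that environment looks roughly like the sketch below (keys follow the launcher code above; the helper function itself is hypothetical):

    def mlu_trainer_env(trainer_id, accelerators, endpoints, current_endpoint):
        """Sketch of the per-trainer environment assembled for an MLU worker."""
        return {
            "FLAGS_selected_mlus": ",".join(str(g) for g in accelerators),
            "FLAGS_selected_accelerators": ",".join(str(g) for g in accelerators),
            "PADDLE_TRAINER_ID": str(trainer_id),
            "PADDLE_CURRENT_ENDPOINT": current_endpoint,
            "PADDLE_TRAINER_ENDPOINTS": ",".join(endpoints),
        }
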
+ +import os +import sys +import time + + +def train(prefix): + selected_mlus = os.getenv("FLAGS_selected_mlus") + trainer_id = int(os.getenv("PADDLE_TRAINER_ID")) + worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS") + current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") + worker_endpoints = worker_endpoints_env + trainers_num = len(worker_endpoints.split(',')) + + name = "selected_mlus:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ + .format(selected_mlus, worker_endpoints, trainers_num, current_endpoint,trainer_id) + + print(name) + with open("{}.check_{}.log".format(prefix, trainer_id), "w") as f: + f.write(name) + + +if __name__ == '__main__': + prefix = sys.argv[1] + train(prefix) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh new file mode 100644 index 0000000000000..adf3019186163 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_async_mlu.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +# test use DISTRIBUTED_TRAINER_ENDPOINTS env in paddlecloud +unset PADDLE_PORT +export DISTRIBUTED_TRAINER_ENDPOINTS=127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 +export cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export TRAINER_PORTS_NUM=2 + +file_0="multi_process_fullpath_launch.check_0.log" +file_1="multi_process_fullpath_launch.check_1.log" + +distributed_args="--ips=${cluster_node_ips} --mlus=0,1 --log_dir=testlog" + +echo "paddle.distributed.fleet.launch async poll process test" +if ! MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process_mlu.py fullpath_launch abort; then + echo "train abort as planned" +fi + +abort_str1="abort>>> selected_mlus:0 worker_endpoints:127.0.0.1:6170,127.0.0.1:6171,127.0.0.2:6170,127.0.0.2:6171 trainers_num:4 current_endpoint:127.0.0.1:6170 trainer_id:0" + +if grep -q "$abort_str1" "$file_0"; then + echo "trainer 0 abort as planned" +else + echo "trainer 0 not abort as planned" + exit -1 +fi + +if [ ! -f $file_1 ]; then + echo "trainer 1 terminate as planned" +else + echo "trainer 1 not terminate as planned" + rm $file_1 + exit -1 +fi + +if [ -f $file_0 ]; then + rm $file_0 +fi diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh new file mode 100644 index 0000000000000..b93b21c1bdf68 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_cloud_mlu.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +set -e + +# use paddlecloud +echo "begin test use paddlecloud" +cluster_node_ips="127.0.0.1,127.0.0.2" +export PADDLE_TRAINERS_NUM=2 +export POD_IP=127.0.0.1 +export PADDLE_TRAINERS=127.0.0.1,127.0.0.2 +export PADDLE_TRAINER_ID=0 + +export PADDLE_PORT=35789 +export TRAINER_PORTS_NUM=2 + +distributed_args="--ips=${cluster_node_ips} --mlus=0,1 --log_dir=testlog" +MLU_VISIBLE_DEVICES=0,1 python -m paddle.distributed.fleet.launch ${distributed_args} multi_process_mlu.py fleetlaunchcloud + +str1="selected_mlus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0" +str2="selected_mlus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1" +file_0="multi_process_fleetlaunchcloud.check_0.log" +file_1="multi_process_fleetlaunchcloud.check_1.log" + +echo "paddlecloud params test" +if grep -q "$str1" "$file_0"; then + echo "find trainer 0" +else + echo "not find trainer 0" + exit -1 +fi + +if grep -q "$str2" "$file_1"; then + echo "find trainer 1" +else + echo "not find trainer 1" + exit -1 +fi + +# test async poll process +if [ -f $file_0 ]; then + rm $file_0 +fi +if [ -f $file_1 ]; then + rm $file_1 +fi diff --git a/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh b/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh new file mode 100644 index 0000000000000..722590dc87f09 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_launch_nproc_mlu.sh @@ -0,0 +1,75 @@ +#!/bin/bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
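
The nproc tests below exercise the --nproc_per_node grouping added to get_device_proc_info(). The grouping itself amounts to the following sketch (Python for clarity; illustrative only, not part of the patch):

    def group_devices(mlus, nproc_per_node):
        """Sketch of how --nproc_per_node splits visible MLUs into per-process lists."""
        assert len(mlus) % nproc_per_node == 0, "device count must divide evenly"
        n = len(mlus) // nproc_per_node  # cards per local process
        return [mlus[i:i + n] for i in range(0, len(mlus), n)]

    # With MLU_VISIBLE_DEVICES=0,1:
    #   group_devices(["0", "1"], 1) -> [["0", "1"]]    # one worker, two cards
    #   group_devices(["0", "1"], 2) -> [["0"], ["1"]]  # two workers, one card each
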
+ +set -e +export FLAGS_START_PORT=35789 + +export MLU_VISIBLE_DEVICES=0,1 + +function test_nproc_0(){ + mlus=$1 + file_0="fleet_nproc_0.check_0.log" + rm -f ${file_0} + distributed_args="--log_dir=testlog --nproc_per_node=1" + python -m paddle.distributed.launch ${distributed_args} nproc_process_mlu.py fleet_nproc_0 + + str0="selected_mlus:${mlus} worker_endpoints:127.0.0.1:35789 trainers_num:1 current_endpoint:127.0.0.1:35789 trainer_id:0" + if grep -q "$str0" "$file_0"; then + echo "find trainer 0" + else + echo "not find trainer 0" + exit -1 + fi + if [ -f $file_0 ]; then + rm $file_0 + fi +} + + +function test_nproc_1(){ + file_0="fleet_nproc_1.check_0.log" + file_1="fleet_nproc_1.check_1.log" + rm -f ${file_0} ${file_1} + + distributed_args="--log_dir=testlog --nproc_per_node=2" + python -m paddle.distributed.launch ${distributed_args} nproc_process_mlu.py fleet_nproc_1 + + str0="selected_mlus:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35789 trainer_id:0" + if grep -q "$str0" "$file_0"; then + echo "find trainer 0" + else + echo "not find trainer 0" + exit -1 + fi + + str1="selected_mlus:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790 trainers_num:2 current_endpoint:127.0.0.1:35790 trainer_id:1" + if grep -q "$str1" "$file_1"; then + echo "find trainer 1" + else + echo "not find trainer 1" + exit -1 + fi + if [ -f $file_0 ]; then + rm $file_0 + fi + if [ -f $file_1 ]; then + rm $file_1 + fi +} + +test_nproc_0 "0,1" + +test_nproc_1 From 783c4aba035542e66f187b1d504d49f8154f2a3f Mon Sep 17 00:00:00 2001 From: Linjie Chen <40840292+linjieccc@users.noreply.github.com> Date: Fri, 25 Feb 2022 14:46:43 +0800 Subject: [PATCH 55/85] move diag_v2 to phi (#39914) --- paddle/fluid/operators/diag_v2_op.cc | 96 ++--------------- paddle/fluid/operators/diag_v2_op.cu | 128 ---------------------- paddle/fluid/operators/diag_v2_op.h | 34 ------ paddle/phi/core/compat/op_utils.h | 3 +- paddle/phi/infermeta/binary.cc | 1 + paddle/phi/infermeta/unary.cc | 40 +++++++ paddle/phi/infermeta/unary.h | 5 + paddle/phi/kernels/cpu/diag_kernel.cc | 66 ++++++++++++ paddle/phi/kernels/diag_kernel.h | 28 +++++ paddle/phi/kernels/funcs/diag_functor.h | 29 +++++ paddle/phi/kernels/gpu/diag_kernel.cu | 134 ++++++++++++++++++++++++ paddle/phi/ops/compat/diag_sig.cc | 27 +++++ 12 files changed, 340 insertions(+), 251 deletions(-) delete mode 100644 paddle/fluid/operators/diag_v2_op.cu delete mode 100644 paddle/fluid/operators/diag_v2_op.h create mode 100644 paddle/phi/kernels/cpu/diag_kernel.cc create mode 100644 paddle/phi/kernels/diag_kernel.h create mode 100644 paddle/phi/kernels/funcs/diag_functor.h create mode 100644 paddle/phi/kernels/gpu/diag_kernel.cu create mode 100644 paddle/phi/ops/compat/diag_sig.cc diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index 30ea323733238..0160277dc79af 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/diag_v2_op.h" #include + +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { @@ -23,44 +25,6 @@ namespace operators { class DiagV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "diag_v2"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "diag_v2"); - - auto x_dims = ctx->GetInputDim("X"); - auto offset = ctx->Attrs().Get("offset"); - - if (x_dims.size() == 1UL) { - int64_t size_ = x_dims[0] + std::abs(offset); - ctx->SetOutputDim("Out", {size_, size_}); - } else if (x_dims.size() == 2UL) { - int64_t size_ = 0; - if (offset >= 0) { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] < x_dims[1] - offset) { - size_ = x_dims[0]; - } else { - size_ = x_dims[1] - offset; - } - } else { - // Note(LutaoChu): Do not use std::min here, otherwise the calculation - // of `size_` will have unexpected result on Windows Python3.8 - if (x_dims[0] + offset < x_dims[1]) { - size_ = x_dims[0] + offset; - } else { - size_ = x_dims[1]; - } - } - ctx->SetOutputDim("Out", {size_}); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "The input tensor X's dimensions of DiagV2Op should be either 1 or " - "2, but received %d.", - x_dims.size())); - } - } }; class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { @@ -94,59 +58,15 @@ class DiagV2OpMaker : public framework::OpProtoAndCheckerMaker { } }; -template -class DiagV2Kernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* x_data = X->data(); - auto x_dims = X->dims(); - int offset = context.Attr("offset"); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_dims = out->dims(); - - int64_t i; - if (x_dims.size() == 1) { - float padding_value = context.Attr("padding_value"); - phi::funcs::SetConstant set_padding_value; - auto& dev_ctx = context.template device_context(); - set_padding_value(dev_ctx, out, static_cast(padding_value)); - - auto x_length = x_dims[0]; - const int& x_stride = ComputeStride(0, x_dims); - - auto out_stride_0 = ComputeStride(0, out_dims); - auto out_stride_1 = ComputeStride(1, out_dims); - out_data += - (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); - - for (i = 0; i < x_length; i++) { - out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride]; - } - } else { - auto out_length = out_dims[0]; - const int& x_stride_0 = ComputeStride(0, x_dims); - const int& x_stride_1 = ComputeStride(1, x_dims); - - auto out_stride_0 = ComputeStride(0, out_dims); - x_data += (offset >= 0 ? 
offset * x_stride_1 : -offset * x_stride_0); - for (i = 0; i < out_length; i++) { - out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)]; - } - } - } -}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(diag_v2, DiagInferShapeFunctor, + PT_INFER_META(phi::DiagInferMeta)); + REGISTER_OPERATOR( diag_v2, ops::DiagV2Op, ops::DiagV2OpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL( - diag_v2, ops::DiagV2Kernel, - ops::DiagV2Kernel, - ops::DiagV2Kernel, - ops::DiagV2Kernel); + paddle::framework::EmptyGradOpMaker, + DiagInferShapeFunctor); diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu deleted file mode 100644 index 9b83b68bea159..0000000000000 --- a/paddle/fluid/operators/diag_v2_op.cu +++ /dev/null @@ -1,128 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/diag_v2_op.h" - -namespace paddle { -namespace operators { - -// Extract the diagonal of a matrix 'x' to a vector 'out'. -template -__global__ void ExtractDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, - std::ptrdiff_t size, - const std::ptrdiff_t sumStride, - const std::ptrdiff_t outStride) { - for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; - idx += gridDim.x * blockDim.x) { - const std::ptrdiff_t xOffset = start + sumStride * idx; - out[outStride * idx] = x[xOffset]; - } -} - -// Paste a vector 'x' to the diagonal of a matrix 'out' -template -__global__ void PasteDiagonalKernel(T* out, const T* x, std::ptrdiff_t start, - std::ptrdiff_t x_length, - const std::ptrdiff_t sumStride, - const std::ptrdiff_t xStride) { - for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; - idx < x_length; idx += gridDim.x * blockDim.x) { - const std::ptrdiff_t outOffset = start + sumStride * idx; - out[outOffset] = x[xStride * idx]; - } -} - -template -class DiagV2CUDAKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* x_data = X->data(); - auto x_dims = X->dims(); - int offset = context.Attr("offset"); - auto* out = context.Output("Out"); - T* out_data = out->mutable_data(context.GetPlace()); - auto out_dims = out->dims(); - auto& dev_ctx = context.template device_context(); - - auto GetBlockGridSize = [&dev_ctx](int64_t size) { - const int64_t block_size = - std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); - int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int64_t max_blocks = std::max(((max_threads - 1) / block_size + 1), - static_cast(1)); - const int64_t grid_size = - std::min(max_blocks, (size + block_size - 1) / block_size); - return std::tuple{block_size, grid_size}; - }; - - if (x_dims.size() == 1) { - float padding_value = 
context.Attr("padding_value"); - phi::funcs::SetConstant set_padding_value; - set_padding_value(dev_ctx, out, static_cast(padding_value)); - - auto x_length = x_dims[0]; - auto size = (offset > 0) ? x_length + offset : x_length - offset; - const int& x_stride = ComputeStride(0, x_dims); - if (size > 0) { - const auto& out_stride_0 = ComputeStride(0, out_dims); - const auto& out_stride_1 = ComputeStride(1, out_dims); - auto start = - (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); - - std::tuple block_grid_size = GetBlockGridSize(size); - - PasteDiagonalKernel< - T><<(block_grid_size), std::get<0>(block_grid_size), 0, - dev_ctx.stream()>>>(out_data, x_data, start, x_length, - out_stride_0 + out_stride_1, x_stride); - } - } else { - const int& x_stride_0 = ComputeStride(0, x_dims); - const int& x_stride_1 = ComputeStride(1, x_dims); - - int64_t size; - if (offset > 0) { - size = std::min(x_dims[0], x_dims[1] - offset); - } else { - size = std::min(x_dims[0] + offset, x_dims[1]); - } - - if (size > 0) { - auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); - const auto& out_stride_0 = ComputeStride(0, out_dims); - - std::tuple block_grid_size = GetBlockGridSize(size); - - ExtractDiagonalKernel< - T><<(block_grid_size), std::get<0>(block_grid_size), 0, - dev_ctx.stream()>>>(out_data, x_data, start, size, - x_stride_0 + x_stride_1, out_stride_0); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - diag_v2, ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel, - ops::DiagV2CUDAKernel); diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/fluid/operators/diag_v2_op.h deleted file mode 100644 index f0bf04badab79..0000000000000 --- a/paddle/fluid/operators/diag_v2_op.h +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using DDim = framework::DDim; - -static inline int ComputeStride(int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index ec810d4e16340..bbf634b4b09b9 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -37,7 +37,8 @@ const std::unordered_set standard_kernel_suffixs({ * after 2.0, and can no longer be occupied by the previously abandoned ops. * They are marked here uniformly. 
*/ -const std::unordered_set deprecated_op_names({"flatten", +const std::unordered_set deprecated_op_names({"diag", + "flatten", "flatten_grad", "matmul", "matmul_grad", diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 58cd43998b8a5..dfaabf7cae21e 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -310,6 +310,7 @@ void BCELossInferMeta(const MetaTensor& input, } out->set_dims(input_dims); + out->set_dtype(input.dtype()); out->share_lod(input); } diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index ca71d6a56d8e7..72b88f537faf2 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/phi/infermeta/unary.h" +#include #include #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" @@ -715,6 +716,45 @@ void UnfoldInferMeta(const MetaTensor& x, out->set_dims(phi::make_ddim(out_dims)); } +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out) { + auto x_dims = x.dims(); + + if (x_dims.size() == 1UL) { + int64_t size_ = x_dims[0] + std::abs(offset); + out->set_dims({size_, size_}); + out->set_dtype(x.dtype()); + } else if (x_dims.size() == 2UL) { + int64_t size_ = 0; + if (offset >= 0) { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] < x_dims[1] - offset) { + size_ = x_dims[0]; + } else { + size_ = x_dims[1] - offset; + } + } else { + // Note(LutaoChu): Do not use std::min here, otherwise the calculation + // of `size_` will have unexpected result on Windows Python3.8 + if (x_dims[0] + offset < x_dims[1]) { + size_ = x_dims[0] + offset; + } else { + size_ = x_dims[1]; + } + } + out->set_dims({size_}); + out->set_dtype(x.dtype()); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "The input tensor X's dimensions of DiagV2Op should be either 1 or " + "2, but received %d.", + x_dims.size())); + } +} + } // namespace phi PD_REGISTER_INFER_META_FN(copy_to, phi::CopyToInferMeta); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 21cbe76bb13c0..1a1605bb1ce4a 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -104,4 +104,9 @@ void UnfoldInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void DiagInferMeta(const MetaTensor& x, + int offset, + float padding_value, + MetaTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/cpu/diag_kernel.cc b/paddle/phi/kernels/cpu/diag_kernel.cc new file mode 100644 index 0000000000000..d1e0b8e31e78f --- /dev/null +++ b/paddle/phi/kernels/cpu/diag_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
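+//
+// DiagKernel (CPU) ports the DiagV2 CPU kernel removed above: a 1-D input is
+// written onto the `offset`-th diagonal of a square output whose remaining
+// entries are filled with `padding_value`; a 2-D input has its `offset`-th
+// diagonal extracted into a 1-D output.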
+ +#include "paddle/phi/kernels/diag_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out) { + auto* x_data = x.data(); + auto x_dims = x.dims(); + T* out_data = dev_ctx.template Alloc(out); + auto out_dims = out->dims(); + + int64_t i; + if (x_dims.size() == 1) { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, out, static_cast(padding_value)); + + auto x_length = x_dims[0]; + const int& x_stride = phi::funcs::ComputeStride(0, x_dims); + + auto out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + auto out_stride_1 = phi::funcs::ComputeStride(1, out_dims); + out_data += (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); + + for (i = 0; i < x_length; i++) { + out_data[i * (out_stride_0 + out_stride_1)] = x_data[i * x_stride]; + } + } else { + auto out_length = out_dims[0]; + const int& x_stride_0 = phi::funcs::ComputeStride(0, x_dims); + const int& x_stride_1 = phi::funcs::ComputeStride(1, x_dims); + + auto out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + x_data += (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); + for (i = 0; i < out_length; i++) { + out_data[i * out_stride_0] = x_data[i * (x_stride_0 + x_stride_1)]; + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + diag, CPU, ALL_LAYOUT, phi::DiagKernel, int, float, double, int64_t) {} diff --git a/paddle/phi/kernels/diag_kernel.h b/paddle/phi/kernels/diag_kernel.h new file mode 100644 index 0000000000000..8dc919fa63360 --- /dev/null +++ b/paddle/phi/kernels/diag_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/diag_functor.h b/paddle/phi/kernels/funcs/diag_functor.h new file mode 100644 index 0000000000000..a806d1583a0b3 --- /dev/null +++ b/paddle/phi/kernels/funcs/diag_functor.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace phi { +namespace funcs { + +inline int ComputeStride(int axis, phi::DDim dims) { + int size = 1; + for (int i = axis + 1; i < dims.size(); i++) { + size *= dims[i]; + } + return size; +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/diag_kernel.cu b/paddle/phi/kernels/gpu/diag_kernel.cu new file mode 100644 index 0000000000000..fc70639787173 --- /dev/null +++ b/paddle/phi/kernels/gpu/diag_kernel.cu @@ -0,0 +1,134 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/diag_kernel.h" + +#include +#include + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/diag_functor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +// Extract the diagonal of a matrix 'x' to a vector 'out'. +template +__global__ void ExtractDiagonalKernel(T* out, + const T* x, + std::ptrdiff_t start, + std::ptrdiff_t size, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t outStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; idx < size; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t xOffset = start + sumStride * idx; + out[outStride * idx] = x[xOffset]; + } +} + +// Paste a vector 'x' to the diagonal of a matrix 'out' +template +__global__ void PasteDiagonalKernel(T* out, + const T* x, + std::ptrdiff_t start, + std::ptrdiff_t x_length, + const std::ptrdiff_t sumStride, + const std::ptrdiff_t xStride) { + for (std::ptrdiff_t idx = blockIdx.x * blockDim.x + threadIdx.x; + idx < x_length; + idx += gridDim.x * blockDim.x) { + const std::ptrdiff_t outOffset = start + sumStride * idx; + out[outOffset] = x[xStride * idx]; + } +} + +template +void DiagKernel(const Context& dev_ctx, + const DenseTensor& x, + int offset, + float padding_value, + DenseTensor* out) { + auto* x_data = x.data(); + auto x_dims = x.dims(); + T* out_data = dev_ctx.template Alloc(out); + auto out_dims = out->dims(); + + auto GetBlockGridSize = [&dev_ctx](int64_t size) { + const int64_t block_size = + std::min(size, static_cast(dev_ctx.GetMaxThreadsPerBlock())); + int64_t max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (size + block_size - 1) / block_size); + return std::tuple{block_size, grid_size}; + }; + + if (x_dims.size() == 1) { + phi::funcs::SetConstant set_padding_value; + set_padding_value(dev_ctx, out, static_cast(padding_value)); + + auto x_length = x_dims[0]; + auto size = (offset > 0) ? 
x_length + offset : x_length - offset; + const int& x_stride = phi::funcs::ComputeStride(0, x_dims); + if (size > 0) { + const auto& out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + const auto& out_stride_1 = phi::funcs::ComputeStride(1, out_dims); + auto start = + (offset >= 0 ? offset * out_stride_1 : -offset * out_stride_0); + + std::tuple block_grid_size = GetBlockGridSize(size); + + PasteDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>(out_data, + x_data, + start, + x_length, + out_stride_0 + out_stride_1, + x_stride); + } + } else { + const int& x_stride_0 = phi::funcs::ComputeStride(0, x_dims); + const int& x_stride_1 = phi::funcs::ComputeStride(1, x_dims); + + int64_t size; + if (offset > 0) { + size = std::min(x_dims[0], x_dims[1] - offset); + } else { + size = std::min(x_dims[0] + offset, x_dims[1]); + } + + if (size > 0) { + auto start = (offset >= 0 ? offset * x_stride_1 : -offset * x_stride_0); + const auto& out_stride_0 = phi::funcs::ComputeStride(0, out_dims); + + std::tuple block_grid_size = GetBlockGridSize(size); + + ExtractDiagonalKernel<<(block_grid_size), + std::get<0>(block_grid_size), + 0, + dev_ctx.stream()>>>( + out_data, x_data, start, size, x_stride_0 + x_stride_1, out_stride_0); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + diag, GPU, ALL_LAYOUT, phi::DiagKernel, int, int64_t, float, double) {} diff --git a/paddle/phi/ops/compat/diag_sig.cc b/paddle/phi/ops/compat/diag_sig.cc new file mode 100644 index 0000000000000..0a14b9095c834 --- /dev/null +++ b/paddle/phi/ops/compat/diag_sig.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
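+//
+// Compat mapping for the legacy fluid `diag_v2` operator: the argument
+// mapping below forwards X, offset and padding_value to the new phi `diag`
+// kernel signature, and PD_REGISTER_BASE_KERNEL_NAME aliases diag_v2 to diag.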
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature DiagOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("diag", {"X"}, {"offset", "padding_value"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(diag_v2, diag); + +PD_REGISTER_ARG_MAPPING_FN(diag_v2, phi::DiagOpArgumentMapping); From 37cb6f32b23025445374fc4da824eab0f4598eac Mon Sep 17 00:00:00 2001 From: Qi Li Date: Fri, 25 Feb 2022 15:07:16 +0800 Subject: [PATCH 56/85] [ROCm] fix Managed Memory Alloc on HIP, test=develop (#39896) * [ROCm] fix Managed Memory Alloc on HIP, test=develop * update, test=develop --- paddle/fluid/memory/allocation/allocator_facade.cc | 3 ++- paddle/fluid/memory/cuda_managed_memory_test.cu | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 9b2aaa9308e5d..4d0e485285146 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -493,7 +493,8 @@ class AllocatorFacadePrivate { "support allocating managed memory.\n" "If you don't actually need to use managed memory, please disable " "it with command `export FLAGS_use_cuda_managed_memory=false`.\n" - "Or you must use the gpu device that supports managed memory.")); + "Or you must use the gpu device that supports managed memory.", + p.device)); } return std::make_shared(p); } diff --git a/paddle/fluid/memory/cuda_managed_memory_test.cu b/paddle/fluid/memory/cuda_managed_memory_test.cu index 4243c5fa90f7f..f8c9ff82f5712 100644 --- a/paddle/fluid/memory/cuda_managed_memory_test.cu +++ b/paddle/fluid/memory/cuda_managed_memory_test.cu @@ -128,6 +128,9 @@ TEST(ManagedMemoryTest, OversubscribeGPUMemoryTest) { } TEST(ManagedMemoryTest, OOMExceptionTest) { + if (!platform::IsGPUManagedMemorySupported(0)) { + return; + } EXPECT_THROW(Alloc(platform::CUDAPlace(0), size_t(1) << 60), memory::allocation::BadAlloc); } From 87b903a3e88df368dfc0602eb689580f8fc8284e Mon Sep 17 00:00:00 2001 From: Aganlengzi Date: Fri, 25 Feb 2022 15:19:09 +0800 Subject: [PATCH 57/85] [phi]migrate increment addmm multinomial cholesky InferShapes to phi (#39913) * [phi]migrate increment addmm multinomial cholesky InferShapes to phi * set_dtype and mod MultinomialFunctor --- paddle/fluid/operators/addmm_op.cc | 88 +------------ paddle/fluid/operators/cholesky_op.cc | 28 +--- paddle/fluid/operators/increment_op.cc | 20 +-- paddle/fluid/operators/multinomial_op.cc | 42 +----- paddle/phi/infermeta/CMakeLists.txt | 2 +- paddle/phi/infermeta/ternary.cc | 92 +++++++++++++ paddle/phi/infermeta/ternary.h | 40 ++++++ paddle/phi/infermeta/unary.cc | 69 ++++++++++ paddle/phi/infermeta/unary.h | 9 ++ paddle/phi/kernels/cholesky_grad_kernel.h | 2 +- paddle/phi/kernels/cpu/multinomial_kernel.cc | 14 +- .../phi/kernels/funcs/multinomial_functor.h | 122 ++++++++++++++++++ paddle/phi/kernels/gpu/multinomial_kernel.cu | 16 ++- paddle/phi/kernels/multinomial_kernel.h | 103 +-------------- paddle/phi/ops/compat/addmm_sig.cc | 6 - paddle/phi/ops/compat/cholesky_sig.cc | 5 - 16 files changed, 383 insertions(+), 275 deletions(-) create mode 100644 paddle/phi/infermeta/ternary.cc create mode 100644 paddle/phi/infermeta/ternary.h create mode 100644 paddle/phi/kernels/funcs/multinomial_functor.h diff --git a/paddle/fluid/operators/addmm_op.cc b/paddle/fluid/operators/addmm_op.cc index 863e64c686d7b..de4d7818020dd 100644 --- 
a/paddle/fluid/operators/addmm_op.cc +++ b/paddle/fluid/operators/addmm_op.cc @@ -16,7 +16,10 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/ternary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -33,85 +36,6 @@ class AddMMOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("Input"), true, - platform::errors::NotFound( - "Input(Input) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::NotFound("Input(X) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::NotFound("Input(Y) of AddMMOp should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, - platform::errors::NotFound( - "Output(Out) of AddMMOp should not be null.")); - - auto input_dims = ctx->GetInputDim("Input"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - - auto ndim_input = input_dims.size(); - auto ndim_x = x_dims.size(); - auto ndim_y = y_dims.size(); - - float alpha = ctx->Attrs().Get("Alpha"); - float beta = ctx->Attrs().Get("Beta"); - - VLOG(3) << "addmm operator input.shape=" << input_dims - << " x.shape=" << x_dims << " y.shape=" << y_dims - << " beta=" << beta << " alpha=" << alpha - << " ndim_input=" << ndim_input << " ndim_x=" << ndim_x - << " ndim_y=" << ndim_y; - - PADDLE_ENFORCE_NE(phi::product(input_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable Input(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("Input").front())); - - PADDLE_ENFORCE_NE(phi::product(x_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable X(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("X").front())); - - PADDLE_ENFORCE_NE(phi::product(y_dims), 0, - platform::errors::PreconditionNotMet( - "The Input variable Y(%s) has not " - "been initialized. You may need to confirm " - "if you put exe.run(startup_program) " - "after optimizer.minimize function.", - ctx->Inputs("Y").front())); - // dim check - PADDLE_ENFORCE_EQ(ndim_input, 2, - platform::errors::InvalidArgument( - "The input tensor input's dimension must be 2. " - "But received input's dimension = [%s].", - ndim_input)); - PADDLE_ENFORCE_EQ(ndim_x, 2, - platform::errors::InvalidArgument( - "The input tensor x's dimension must be 2. " - "But received x's dimension = [%s].", - ndim_x)); - PADDLE_ENFORCE_EQ(ndim_y, 2, - platform::errors::InvalidArgument( - "The input tensor y's dimension must be 2. 
" - "But received y's dimension = [%s].", - ndim_y)); - - std::vector output_dims; - output_dims.push_back(x_dims[0]); - output_dims.push_back(y_dims[1]); - - ctx->SetOutputDim("Out", phi::make_ddim(output_dims)); - ctx->ShareLoD("Input", /*->*/ "Out"); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const { framework::LibraryType library = framework::LibraryType::kPlain; @@ -223,9 +147,11 @@ class AddMMOpGradMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; - +DELCARE_INFER_SHAPE_FUNCTOR(addmm, AddmmInferShapeFunctor, + PT_INFER_META(phi::AddmmInferMeta)); REGISTER_OPERATOR(addmm, ops::AddMMOp, ops::AddMMOpMaker, ops::AddMMOpGradMaker, - ops::AddMMOpGradMaker); + ops::AddMMOpGradMaker, + AddmmInferShapeFunctor); REGISTER_OPERATOR(addmm_grad, ops::AddMMGradOp); diff --git a/paddle/fluid/operators/cholesky_op.cc b/paddle/fluid/operators/cholesky_op.cc index 93dee0df7b954..09e915a6bafd4 100644 --- a/paddle/fluid/operators/cholesky_op.cc +++ b/paddle/fluid/operators/cholesky_op.cc @@ -12,7 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,26 +26,6 @@ using framework::Tensor; class CholeskyOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Cholesky"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Cholesky"); - auto dims = ctx->GetInputDim("X"); - auto rank = dims.size(); - PADDLE_ENFORCE_GE(rank, 2, - platform::errors::InvalidArgument( - "The Input(X) should have at least 2 dimensions. But " - "received a %d dimension tensor.", - rank)); - PADDLE_ENFORCE_EQ( - dims[rank - 2], dims[rank - 1], - platform::errors::InvalidArgument( - "The inner-most 2 dimensions of Input(X) all should be symmetric " - "positive-definite matrices and have the same size. But received " - "X's shape[-2] = %d and shape[-1] = %d.", - dims[rank - 2], dims[rank - 1])); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } }; class CholeskyOpMaker : public framework::OpProtoAndCheckerMaker { @@ -107,7 +90,10 @@ class CholeskyGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(cholesky, CholeskyInferShapeFunctor, + PT_INFER_META(phi::CholeskyInferMeta)); REGISTER_OPERATOR(cholesky, ops::CholeskyOp, ops::CholeskyOpMaker, ops::CholeskyGradOpMaker, - ops::CholeskyGradOpMaker); + ops::CholeskyGradOpMaker, + CholeskyInferShapeFunctor); REGISTER_OPERATOR(cholesky_grad, ops::CholeskyGradOp); diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 3d8e80bfaeb8f..105d818e19743 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -12,7 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace framework { @@ -37,18 +40,6 @@ class IncrementOp : public framework::OperatorWithKernel { const framework::AttributeMap &attrs) : OperatorWithKernel(type, inputs, outputs, attrs) {} - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(phi::product(ctx->GetInputDim("X")), 1UL, - platform::errors::InvalidArgument( - "The number of elements in Input(X) should be 1." - "Now the number is %d.", - phi::product(ctx->GetInputDim("X")))); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "increment"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "increment"); - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -96,6 +87,9 @@ class IncrementGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(increment, IncrementInferShapeFunctor, + PT_INFER_META(phi::IncrementInferMeta)); REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, ops::IncrementGradOpMaker, - ops::IncrementGradOpMaker); + ops::IncrementGradOpMaker, + IncrementInferShapeFunctor); diff --git a/paddle/fluid/operators/multinomial_op.cc b/paddle/fluid/operators/multinomial_op.cc index 00eaa2f8e77cf..1143f9cb37aa5 100644 --- a/paddle/fluid/operators/multinomial_op.cc +++ b/paddle/fluid/operators/multinomial_op.cc @@ -16,10 +16,11 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -45,38 +46,6 @@ This OP returns a Tensor filled with the sampled categoris according to Multinom class MultinomialOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Multinomial"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Multinomial"); - - auto x_dim = ctx->GetInputDim("X"); - int64_t x_rank = x_dim.size(); - PADDLE_ENFORCE_GT(x_rank, 0, - platform::errors::InvalidArgument( - "The number of dimensions of the input probability " - "distribution should be > 0, but got %d.", - x_rank)); - PADDLE_ENFORCE_LE(x_rank, 2, - platform::errors::InvalidArgument( - "The number of dimensions of the input probability " - "distribution should be <= 2, but got %d.", - x_rank)); - - std::vector out_dims(x_rank); - for (int64_t i = 0; i < x_rank - 1; i++) { - out_dims[i] = x_dim[i]; - } - - int64_t num_samples = ctx->Attrs().Get("num_samples"); - PADDLE_ENFORCE_GT( - num_samples, 0, - platform::errors::InvalidArgument( - "The number of samples should be > 0, but got %d.", num_samples)); - out_dims[x_rank - 1] = num_samples; - - ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - } }; } // namespace operators @@ -84,7 +53,10 @@ class MultinomialOp : public framework::OperatorWithKernel { 
namespace ops = paddle::operators; namespace plat = paddle::platform; +DELCARE_INFER_SHAPE_FUNCTOR(multinomial, MultinomialInferShapeFunctor, + PT_INFER_META(phi::MultinomialInferMeta)); REGISTER_OPERATOR( multinomial, ops::MultinomialOp, ops::MultinomialOpMaker, paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); + paddle::framework::EmptyGradOpMaker, + MultinomialInferShapeFunctor); diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt index c077e7b4c5563..f7102629d213c 100644 --- a/paddle/phi/infermeta/CMakeLists.txt +++ b/paddle/phi/infermeta/CMakeLists.txt @@ -1,2 +1,2 @@ -cc_library(infermeta SRCS nullary.cc unary.cc binary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) +cc_library(infermeta SRCS nullary.cc unary.cc binary.cc ternary.cc multiary.cc DEPS convert_utils meta_tensor infermeta_utils) cc_library(backward_infermeta SRCS backward.cc DEPS meta_tensor convert_utils) diff --git a/paddle/phi/infermeta/ternary.cc b/paddle/phi/infermeta/ternary.cc new file mode 100644 index 0000000000000..52aeaef843854 --- /dev/null +++ b/paddle/phi/infermeta/ternary.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/funcs/common_shape.h" + +namespace phi { + +void AddmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float alpha, + float beta, + MetaTensor* out) { + auto input_dims = input.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + auto ndim_input = input_dims.size(); + auto ndim_x = x_dims.size(); + auto ndim_y = y_dims.size(); + + VLOG(3) << "addmm operator input.shape=" << input_dims + << " x.shape=" << x_dims << " y.shape=" << y_dims << " beta=" << beta + << " alpha=" << alpha << " ndim_input=" << ndim_input + << " ndim_x=" << ndim_x << " ndim_y=" << ndim_y; + + PADDLE_ENFORCE_NE( + product(input_dims), + 0, + errors::PreconditionNotMet("The Input variable 'input' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(x_dims), + 0, + errors::PreconditionNotMet("The Input variable 'x' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + + PADDLE_ENFORCE_NE( + product(y_dims), + 0, + errors::PreconditionNotMet("The Input variable 'y' has not " + "been initialized. You may need to confirm " + "if you put exe.run(startup_program) " + "after optimizer.minimize function.")); + // dim check + PADDLE_ENFORCE_EQ( + ndim_input, + 2, + errors::InvalidArgument("The input tensor input's dimension must be 2. " + "But received input's dimension = [%s].", + ndim_input)); + PADDLE_ENFORCE_EQ( + ndim_x, + 2, + errors::InvalidArgument("The input tensor x's dimension must be 2. 
" + "But received x's dimension = [%s].", + ndim_x)); + PADDLE_ENFORCE_EQ( + ndim_y, + 2, + errors::InvalidArgument("The input tensor y's dimension must be 2. " + "But received y's dimension = [%s].", + ndim_y)); + + std::vector output_dims; + output_dims.push_back(x_dims[0]); + output_dims.push_back(y_dims[1]); + + out->set_dims(make_ddim(output_dims)); + out->share_lod(input); + out->set_dtype(input.dtype()); +} + +} // namespace phi diff --git a/paddle/phi/infermeta/ternary.h b/paddle/phi/infermeta/ternary.h new file mode 100644 index 0000000000000..d6223dd87aaf8 --- /dev/null +++ b/paddle/phi/infermeta/ternary.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" + +namespace phi { + +// Common InferMeta Functions for ternary operators, The format like: +// +// 1. void [FunctionDesc|OpName]InferMeta(const MetaTensor& x, +// const MetaTensor& y, +// const MetaTensor& z, +// ..., +// MetaTensor* out) {} +// +// NOTE: The name "InferShape" may be not appropriate. "InferMeta" may be good. +// Because functions in this file not only can infer shape, but also need +// infer lod or other useful data. + +void AddmmInferMeta(const MetaTensor& input, + const MetaTensor& x, + const MetaTensor& y, + float alpha, + float beta, + MetaTensor* out); + +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 72b88f537faf2..9b2f310e85d4b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -81,6 +81,28 @@ void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { out->set_layout(x.layout()); } +void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out) { + auto dims = x.dims(); + auto rank = dims.size(); + PADDLE_ENFORCE_GE(rank, + 2, + errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions. But " + "received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + dims[rank - 2], + dims[rank - 1], + errors::InvalidArgument( + "The inner-most 2 dimensions of Input(X) all should be symmetric " + "positive-definite matrices and have the same size. But received " + "X's shape[-2] = %d and shape[-1] = %d.", + dims[rank - 2], + dims[rank - 1])); + out->set_dims(x.dims()); + out->set_dtype(x.dtype()); +} + void CopyToInferMeta(const MetaTensor& x, Backend backend, bool blocking, @@ -94,6 +116,18 @@ void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out) { out->set_layout(x.layout()); } +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out) { + PADDLE_ENFORCE_EQ( + product(x.dims()), + 1UL, + errors::InvalidArgument("The number of elements in Input(X) should be 1." 
+ "Now the number is %d.", + product(x.dims()))); + out->set_dims(x.dims()); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + static phi::DDim ValidateShape(const std::vector shape, const phi::DDim& in_dims) { const int64_t in_size = phi::product(in_dims); @@ -234,6 +268,41 @@ void InferMetaFromVecValue(const MetaTensor& x, } } +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + bool replacement, + MetaTensor* out) { + auto x_dim = x.dims(); + int64_t x_rank = x_dim.size(); + PADDLE_ENFORCE_GT(x_rank, + 0, + errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be > 0, but got %d.", + x_rank)); + PADDLE_ENFORCE_LE(x_rank, + 2, + errors::InvalidArgument( + "The number of dimensions of the input probability " + "distribution should be <= 2, but got %d.", + x_rank)); + + std::vector out_dims(x_rank); + for (int64_t i = 0; i < x_rank - 1; i++) { + out_dims[i] = x_dim[i]; + } + + PADDLE_ENFORCE_GT( + num_samples, + 0, + errors::InvalidArgument( + "The number of samples should be > 0, but got %d.", num_samples)); + out_dims[x_rank - 1] = num_samples; + + out->set_dims(make_ddim(out_dims)); + out->set_dtype(DataType::INT64); +} + void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, MetaTensor* out, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 1a1605bb1ce4a..40bf4e333569c 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -41,6 +41,8 @@ void FlattenInferMeta(const MetaTensor& x, void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); +void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); + void CopyToInferMeta(const MetaTensor& x, Backend backend, bool blocking, @@ -48,10 +50,17 @@ void CopyToInferMeta(const MetaTensor& x, void CreateLikeInferMeta(const MetaTensor& x, DataType dtype, MetaTensor* out); +void IncrementInferMeta(const MetaTensor& x, float value, MetaTensor* out); + void InferMetaFromVecValue(const MetaTensor& x, const std::vector& shape, MetaTensor* out); +void MultinomialInferMeta(const MetaTensor& x, + int num_samples, + bool replacement, + MetaTensor* out); + void ReshapeInferMeta(const MetaTensor& x, const ScalarArray& shape, MetaTensor* out, diff --git a/paddle/phi/kernels/cholesky_grad_kernel.h b/paddle/phi/kernels/cholesky_grad_kernel.h index 3fb532d9af7f9..b170a3d7ffcfa 100644 --- a/paddle/phi/kernels/cholesky_grad_kernel.h +++ b/paddle/phi/kernels/cholesky_grad_kernel.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/phi/kernels/cpu/multinomial_kernel.cc b/paddle/phi/kernels/cpu/multinomial_kernel.cc index 67e7d5bb68c61..e9c2a569e0650 100644 --- a/paddle/phi/kernels/cpu/multinomial_kernel.cc +++ b/paddle/phi/kernels/cpu/multinomial_kernel.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/multinomial_functor.h" namespace phi { @@ -32,12 +33,13 @@ void MultinomialKernel(const Context& dev_ctx, const int64_t num_categories = in_dims[in_rank - 1]; const int64_t num_distributions = in_rank > 1 ? 
in_dims[in_rank - 2] : 1; - MultinomialFunctor(out_data, - in_data, - num_samples, - replacement, - num_categories, - num_distributions); + funcs::MultinomialFunctor(dev_ctx, + out_data, + in_data, + num_samples, + replacement, + num_categories, + num_distributions); } } // namespace phi diff --git a/paddle/phi/kernels/funcs/multinomial_functor.h b/paddle/phi/kernels/funcs/multinomial_functor.h new file mode 100644 index 0000000000000..05a5a0faf6774 --- /dev/null +++ b/paddle/phi/kernels/funcs/multinomial_functor.h @@ -0,0 +1,122 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/enforce.h" + +namespace phi { +namespace funcs { + +template +void MultinomialFunctor(const Context& dev_ctx, + int64_t* out_data, + const T* in_data, + const int64_t num_samples, + const bool replacement, + const int64_t num_categories, + const int64_t num_distributions) { + std::vector cumulative_probs(num_categories); + + std::uniform_real_distribution dist(0, 1); + auto engine = dev_ctx.GetHostGenerator()->GetCPUEngine(); + + for (int64_t i = 0; i < num_distributions; i++) { + T probs_sum = 0; + T prob_value; + int64_t num_zeros = 0; + for (int64_t j = 0; j < num_categories; j++) { + prob_value = in_data[i * num_categories + j]; + PADDLE_ENFORCE_GE( + prob_value, + 0.0, + errors::InvalidArgument("The input of multinomial distribution " + "should be >= 0, but got %f.", + prob_value)); + + probs_sum += prob_value; + if (prob_value == 0) { + num_zeros += 1; + } + cumulative_probs[j] = probs_sum; + } + PADDLE_ENFORCE_GT( + probs_sum, + 0.0, + errors::InvalidArgument("The sum of one multinomial distribution " + "probability should be > 0, but got %f.", + probs_sum)); + PADDLE_ENFORCE_EQ( + (replacement || (num_categories - num_zeros >= num_samples)), + true, + errors::InvalidArgument("When replacement is False, number of " + "samples should be less than non-zero " + "categories.")); + + for (int64_t j = 0; j < num_categories; j++) { + cumulative_probs[j] /= probs_sum; + } + + for (int64_t s = 0; s < num_samples; s++) { + T uniform_rand = dist(*engine); + // use binary search to get the selected category sample id. + // let cumulative_probs[id-1] < uniform_rand < cumulative_probs[id]. + int64_t left = 0; + int64_t right = num_categories; + int64_t mid; + int64_t sample_id; + T temp_prob; + cumulative_probs[(num_categories - 1)] = 1; + + while (right > left) { + mid = left + (right - left) / 2; + temp_prob = cumulative_probs[mid]; + if (temp_prob < uniform_rand) { + left = mid + 1; + } else { + right = mid; + } + } + sample_id = left; + + out_data[i * num_samples + s] = sample_id; + + // if replacement is false, the selected category should be removed. 
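+      // Concretely: subtract the sampled category's own probability mass
+      // (sample_prob = cdf[sample_id] - cdf[sample_id - 1], with cdf[-1]
+      // taken as 0) from every cumulative entry at index >= sample_id, then
+      // divide the whole CDF by the remaining mass (1 - sample_prob) so it
+      // sums to 1 again before the next draw.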
+ if (!replacement && s < num_samples - 1) { + T sample_prob; + T new_prob = 0; + T new_sum; + + if (sample_id != 0) { + new_prob = cumulative_probs[sample_id - 1]; + } + sample_prob = cumulative_probs[sample_id] - new_prob; + new_sum = 1.0 - sample_prob; + + for (int64_t j = 0; j < num_categories; j++) { + new_prob = cumulative_probs[j]; + if (j >= sample_id) { + new_prob -= sample_prob; + } + new_prob /= new_sum; + cumulative_probs[j] = new_prob; + } + } + } + } +} + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/multinomial_kernel.cu b/paddle/phi/kernels/gpu/multinomial_kernel.cu index ea1cf361958aa..4918495ff7bed 100644 --- a/paddle/phi/kernels/gpu/multinomial_kernel.cu +++ b/paddle/phi/kernels/gpu/multinomial_kernel.cu @@ -27,6 +27,7 @@ limitations under the License. */ #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/multinomial_functor.h" namespace phi { @@ -154,7 +155,7 @@ void MultinomialKernel(const Context& dev_ctx, // can // be used only once. So after every sample, probability of the distribution // will change. The implementation can't be parallelizable. Thus, call CPU - // implementation ``MultinomialFunctor`` to sample the distribution. + // implementation ``funcs::MultinomialFunctor`` to sample the distribution. if (!replacement) { int64_t in_data_numel = x.numel(); int64_t out_data_numel = out->numel(); @@ -172,12 +173,13 @@ void MultinomialKernel(const Context& dev_ctx, cudaMemcpyDeviceToHost); #endif - MultinomialFunctor(cpu_out_data, - cpu_in_data, - num_samples, - replacement, - num_categories, - num_distributions); + funcs::MultinomialFunctor(dev_ctx, + cpu_out_data, + cpu_in_data, + num_samples, + replacement, + num_categories, + num_distributions); #ifdef PADDLE_WITH_HIP hipMemcpy(out_data, diff --git a/paddle/phi/kernels/multinomial_kernel.h b/paddle/phi/kernels/multinomial_kernel.h index 70be21dc2861f..f3d8770bc1b60 100644 --- a/paddle/phi/kernels/multinomial_kernel.h +++ b/paddle/phi/kernels/multinomial_kernel.h @@ -15,109 +15,14 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" -#include "paddle/fluid/framework/generator.h" namespace phi { template void MultinomialKernel(const Context& dev_ctx, - const DenseTensor& x, - int num_samples, - bool replacement, - DenseTensor* out); - -template -void MultinomialFunctor(int64_t* out_data, const T* in_data, - const int64_t num_samples, const bool replacement, - const int64_t num_categories, - const int64_t num_distributions) { - std::vector cumulative_probs(num_categories); - - std::uniform_real_distribution dist(0, 1); - auto gen_ptr = paddle::framework::DefaultCPUGenerator(); - auto engine = gen_ptr->GetCPUEngine(); - - for (int64_t i = 0; i < num_distributions; i++) { - T probs_sum = 0; - T prob_value; - int64_t num_zeros = 0; - for (int64_t j = 0; j < num_categories; j++) { - prob_value = in_data[i * num_categories + j]; - PADDLE_ENFORCE_GE(prob_value, 0.0, - errors::InvalidArgument( - "The input of multinomial distribution " - "should be >= 0, but got %f.", - prob_value)); - - probs_sum += prob_value; - if (prob_value == 0) { - num_zeros += 1; - } - cumulative_probs[j] = probs_sum; - } - PADDLE_ENFORCE_GT(probs_sum, 0.0, - errors::InvalidArgument( - "The sum of one multinomial distribution " - "probability should be > 0, but got %f.", - probs_sum)); - PADDLE_ENFORCE_EQ( - (replacement || (num_categories - num_zeros >= num_samples)), true, - 
errors::InvalidArgument( - "When replacement is False, number of " - "samples should be less than non-zero " - "categories.")); - - for (int64_t j = 0; j < num_categories; j++) { - cumulative_probs[j] /= probs_sum; - } - - for (int64_t s = 0; s < num_samples; s++) { - T uniform_rand = dist(*engine); - // use binary search to get the selected category sample id. - // let cumulative_probs[id-1] < uniform_rand < cumulative_probs[id]. - int64_t left = 0; - int64_t right = num_categories; - int64_t mid; - int64_t sample_id; - T temp_prob; - cumulative_probs[(num_categories - 1)] = 1; - - while (right > left) { - mid = left + (right - left) / 2; - temp_prob = cumulative_probs[mid]; - if (temp_prob < uniform_rand) { - left = mid + 1; - } else { - right = mid; - } - } - sample_id = left; - - out_data[i * num_samples + s] = sample_id; - - // if replacement is false, the selected category should be removed. - if (!replacement && s < num_samples - 1) { - T sample_prob; - T new_prob = 0; - T new_sum; - - if (sample_id != 0) { - new_prob = cumulative_probs[sample_id - 1]; - } - sample_prob = cumulative_probs[sample_id] - new_prob; - new_sum = 1.0 - sample_prob; - - for (int64_t j = 0; j < num_categories; j++) { - new_prob = cumulative_probs[j]; - if (j >= sample_id) { - new_prob -= sample_prob; - } - new_prob /= new_sum; - cumulative_probs[j] = new_prob; - } - } - } - } -} + const DenseTensor& x, + int num_samples, + bool replacement, + DenseTensor* out); } // namespace phi diff --git a/paddle/phi/ops/compat/addmm_sig.cc b/paddle/phi/ops/compat/addmm_sig.cc index 34da5fe9fe954..b3bc0bb23a71e 100644 --- a/paddle/phi/ops/compat/addmm_sig.cc +++ b/paddle/phi/ops/compat/addmm_sig.cc @@ -16,11 +16,6 @@ namespace phi { -KernelSignature AddmmOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature( - "addmm", {"Input", "X", "Y"}, {"Alpha", "Beta"}, {"Out"}); -} - KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature( "addmm_grad", @@ -31,5 +26,4 @@ KernelSignature AddmmGradOpArgumentMapping(const ArgumentMappingContext& ctx) { } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(addmm, phi::AddmmOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(addmm_grad, phi::AddmmGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/cholesky_sig.cc b/paddle/phi/ops/compat/cholesky_sig.cc index 068c7f4f0a77a..8c7ca75704669 100644 --- a/paddle/phi/ops/compat/cholesky_sig.cc +++ b/paddle/phi/ops/compat/cholesky_sig.cc @@ -16,10 +16,6 @@ namespace phi { -KernelSignature CholeskyOpArgumentMapping(const ArgumentMappingContext& ctx) { - return KernelSignature("cholesky", {"X"}, {"upper"}, {"Out"}); -} - KernelSignature CholeskyGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature("cholesky_grad", @@ -30,5 +26,4 @@ KernelSignature CholeskyGradOpArgumentMapping( } // namespace phi -PD_REGISTER_ARG_MAPPING_FN(cholesky, phi::CholeskyOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(cholesky_grad, phi::CholeskyGradOpArgumentMapping); From d8fc7211f98c7192f5079b52f87226260c2ea03d Mon Sep 17 00:00:00 2001 From: Zhang Zheng <32410583+ZzSean@users.noreply.github.com> Date: Fri, 25 Feb 2022 16:13:47 +0800 Subject: [PATCH 58/85] Fix conflict caused by wrong namespace (#39930) --- paddle/fluid/operators/softmax_with_cross_entropy_op.cu | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 
92e2adb3ee8d2..19a395e72314d 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -32,6 +32,7 @@ namespace operators { using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using DataLayout = platform::DataLayout; using Tensor = framework::Tensor; +namespace kps = phi::kps; // Wrapper of log function. Use log(float32) for float16 template @@ -500,7 +501,7 @@ template __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( T* loss, T* softmax, const T* logits, const LabelT* label, int size, - const int offset, const LogSoftmaxForwardFunctor& func, + const int offset, const phi::LogSoftmaxForwardFunctor& func, const int ignore_index) { using VecT = kps::details::VectorType; int tid = threadIdx.x; @@ -583,7 +584,7 @@ template __device__ __forceinline__ void ScalarSoftmaxForwardImpl( T* loss, T* softmax, const T* logits, const LabelT* label, const int size, - const LogSoftmaxForwardFunctor& func, const int ignore_index) { + const phi::LogSoftmaxForwardFunctor& func, const int ignore_index) { int tid = threadIdx.x; int remain = size % (VecSize * blockDim.x); int label_id = blockIdx.x; @@ -658,7 +659,7 @@ __global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, sum, kps::AddFunctor()); // 3. softmax - LogSoftmaxForwardFunctor func(max, sum); + phi::LogSoftmaxForwardFunctor func(max, sum); if (input_offset == output_offset) { VectorizedSoftmaxForwardImpl( loss, softmax, logits, label, mid_dim, input_offset, func, From 4fe465cbf27609ef1c2bdc563207f3c3f6e875fb Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Fri, 25 Feb 2022 16:18:13 +0800 Subject: [PATCH 59/85] Disable dist ut cases (#39906) * disable some distribute test case when in CPU test env * disable some test case when in CPU test env * fix --- paddle/fluid/inference/api/CMakeLists.txt | 6 ++++-- paddle/fluid/inference/tests/api/CMakeLists.txt | 14 ++++++++++---- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 6c465e6278059..87efe5ec51903 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -56,8 +56,10 @@ cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) - inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared - ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + if (WITH_GPU) + inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_inference_shared + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) + endif() elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 85fe931cf93f8..37214534f3c93 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -299,7 +299,9 @@ inference_analysis_api_test(test_analyzer_pyramid_dnn ${PYRAMID_DNN_INSTALL_DIR} set(ERNIE_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/Ernie") download_model_and_data(${ERNIE_INSTALL_DIR} "Ernie_model.tar.gz" aa59192dd41ed377f9f168e3a1309fa6 "Ernie_data.txt.tar.gz" 
5396e63548edad7ca561e7e26a9476d1) download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" 73beea65abda2edb61c1662cd3180c62) -inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +if (WITH_GPU) + inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) +endif() inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) # Ernie large @@ -551,7 +553,9 @@ endif() # bert, max_len=20, embedding_dim=128 set(BERT_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/bert_emb128") download_model_and_data_without_verify(${BERT_INSTALL_DIR} "bert_emb128_model.tar.gz" "bert_data_len20.txt.tar.gz") -inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +if (WITH_GPU) + inference_analysis_api_test(test_analyzer_bert ${BERT_INSTALL_DIR} analyzer_bert_tester.cc) +endif() # multiple models prediction set(MMP_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/multi_model_prediction") @@ -741,13 +745,15 @@ set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_mobilenet_transpose PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ner PROPERTIES TIMEOUT 120) -set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_ernie_int8 PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) -set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120) +if (WITH_GPU) + set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +endif() if(WITH_GPU AND TENSORRT_FOUND) set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120) if(WITH_MKLDNN) From 639675de36d5853b153507b7a50244350af447fa Mon Sep 17 00:00:00 2001 From: 0x45f <23097963+0x45f@users.noreply.github.com> Date: Fri, 25 Feb 2022 18:19:46 +0800 Subject: [PATCH 60/85] =?UTF-8?q?move=20eye=E3=80=81size=E3=80=81erfinv?= =?UTF-8?q?=E3=80=81pixel=5Fshuffle=20OP=20to=20phi=20(#39712)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * move eye OP to pten * move size OP to pten * merge develop * fix merge * move files * move erfinv OP to phi * remove comment * move pixel_shuffle OP to phi * remove comment * fix PT_REGISTER * fix NPU * fix CR * remove size_sig.cc for PR-CI-Coverage --- paddle/fluid/operators/erfinv_op.cc | 15 +-- paddle/fluid/operators/erfinv_op.h | 65 ----------- paddle/fluid/operators/eye_op.cc | 8 +- paddle/fluid/operators/eye_op.cu | 24 ---- paddle/fluid/operators/eye_op.h | 61 ---------- paddle/fluid/operators/eye_op_npu.cc | 2 +- paddle/fluid/operators/pixel_shuffle_op.cc | 12 +- paddle/fluid/operators/pixel_shuffle_op.cu | 26 ----- paddle/fluid/operators/pixel_shuffle_op.h | 106 ------------------ paddle/fluid/operators/size_op.cc | 5 - paddle/fluid/operators/size_op.cu | 22 ---- paddle/fluid/operators/size_op.h | 44 -------- paddle/fluid/operators/size_op_npu.cc | 2 +- paddle/phi/kernels/cpu/erfinv_grad_kernel.cc | 22 ++++ paddle/phi/kernels/cpu/erfinv_kernel.cc | 21 ++++ paddle/phi/kernels/cpu/eye_kernel.cc | 29 +++++ 
.../kernels/cpu/pixel_shuffle_grad_kernel.cc | 26 +++++ .../phi/kernels/cpu/pixel_shuffle_kernel.cc | 22 ++++ paddle/phi/kernels/cpu/size_kernel.cc | 30 +++++ paddle/phi/kernels/erfinv_grad_kernel.h | 27 +++++ paddle/phi/kernels/erfinv_kernel.h | 24 ++++ paddle/phi/kernels/eye_kernel.h | 28 +++++ paddle/phi/kernels/gpu/erfinv_grad_kernel.cu | 22 ++++ paddle/phi/kernels/gpu/erfinv_kernel.cu | 21 ++++ .../kernels/gpu/eye_kernel.cu} | 25 +++-- .../kernels/gpu/pixel_shuffle_grad_kernel.cu | 26 +++++ .../phi/kernels/gpu/pixel_shuffle_kernel.cu | 22 ++++ paddle/phi/kernels/gpu/size_kernel.cu | 30 +++++ .../kernels/impl/erfinv_grad_kernel_impl.h | 39 +++++++ paddle/phi/kernels/impl/erfinv_kernel_impl.h | 35 ++++++ paddle/phi/kernels/impl/eye_kernel_impl.h | 54 +++++++++ .../impl/pixel_shuffle_grad_kernel_impl.h | 57 ++++++++++ .../kernels/impl/pixel_shuffle_kernel_impl.h | 56 +++++++++ paddle/phi/kernels/impl/size_kernel_impl.h | 39 +++++++ .../phi/kernels/pixel_shuffle_grad_kernel.h | 29 +++++ paddle/phi/kernels/pixel_shuffle_kernel.h | 29 +++++ paddle/phi/kernels/size_kernel.h | 24 ++++ paddle/phi/ops/compat/erfinv_sig.cc | 26 +++++ paddle/phi/ops/compat/eye_sig.cc | 26 +++++ paddle/phi/ops/compat/pixel_shuffle_sig.cc | 37 ++++++ 40 files changed, 819 insertions(+), 399 deletions(-) delete mode 100644 paddle/fluid/operators/erfinv_op.h delete mode 100644 paddle/fluid/operators/eye_op.cu delete mode 100644 paddle/fluid/operators/eye_op.h delete mode 100644 paddle/fluid/operators/pixel_shuffle_op.cu delete mode 100644 paddle/fluid/operators/pixel_shuffle_op.h delete mode 100644 paddle/fluid/operators/size_op.cu delete mode 100644 paddle/fluid/operators/size_op.h create mode 100644 paddle/phi/kernels/cpu/erfinv_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/erfinv_kernel.cc create mode 100644 paddle/phi/kernels/cpu/eye_kernel.cc create mode 100644 paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc create mode 100644 paddle/phi/kernels/cpu/size_kernel.cc create mode 100644 paddle/phi/kernels/erfinv_grad_kernel.h create mode 100644 paddle/phi/kernels/erfinv_kernel.h create mode 100644 paddle/phi/kernels/eye_kernel.h create mode 100644 paddle/phi/kernels/gpu/erfinv_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/erfinv_kernel.cu rename paddle/{fluid/operators/erfinv_op.cu => phi/kernels/gpu/eye_kernel.cu} (52%) create mode 100644 paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu create mode 100644 paddle/phi/kernels/gpu/size_kernel.cu create mode 100644 paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/erfinv_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/eye_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/size_kernel_impl.h create mode 100644 paddle/phi/kernels/pixel_shuffle_grad_kernel.h create mode 100644 paddle/phi/kernels/pixel_shuffle_kernel.h create mode 100644 paddle/phi/kernels/size_kernel.h create mode 100644 paddle/phi/ops/compat/erfinv_sig.cc create mode 100644 paddle/phi/ops/compat/eye_sig.cc create mode 100644 paddle/phi/ops/compat/pixel_shuffle_sig.cc diff --git a/paddle/fluid/operators/erfinv_op.cc b/paddle/fluid/operators/erfinv_op.cc index 56a6a80b45dff..f9489a7bd0947 100644 --- a/paddle/fluid/operators/erfinv_op.cc +++ 
b/paddle/fluid/operators/erfinv_op.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/erfinv_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -85,16 +85,3 @@ REGISTER_OPERATOR( paddle::operators::ErfinvInplaceInferer); REGISTER_OPERATOR(erfinv_grad, paddle::operators::ErfinvGradOp); - -REGISTER_OP_CPU_KERNEL( - erfinv, - paddle::operators::ErfinvKernel, - paddle::operators::ErfinvKernel); - -REGISTER_OP_CPU_KERNEL( - erfinv_grad, - paddle::operators::ErfinvGradKernel, - paddle::operators::ErfinvGradKernel); diff --git a/paddle/fluid/operators/erfinv_op.h b/paddle/fluid/operators/erfinv_op.h deleted file mode 100644 index 934d0f4a5a715..0000000000000 --- a/paddle/fluid/operators/erfinv_op.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#ifndef _USE_MATH_DEFINES -#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows -#endif -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" - -namespace paddle { -namespace operators { - -// ndtri(x * 0.5 + 0.5) / sqrt(2) -template -class ErfinvKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto in = ctx.Input("X"); - auto out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto eigen_in = framework::EigenVector::Flatten(*in); - auto eigen_out = framework::EigenVector::Flatten(*out); - auto& place = *ctx.template device_context().eigen_device(); - constexpr T half = static_cast(0.5); - constexpr T half_sqrt = static_cast(M_SQRT1_2); - eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; - } -}; - -// sqrt(pi) / 2 * exp(square(out)) * grad -template -class ErfinvGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto out = ctx.Input("Out"); - auto dout = ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_dout = framework::EigenVector::Flatten(*dout); - auto eigen_dx = framework::EigenVector::Flatten(*dx); - auto& place = *ctx.template device_context().eigen_device(); - - constexpr T half_sqrt_pi = static_cast(1 / M_2_SQRTPI); - eigen_dx.device(place) = - half_sqrt_pi * eigen_dout * eigen_out.square().exp(); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eye_op.cc b/paddle/fluid/operators/eye_op.cc index 793519b401821..8f8a0f174a79f 100644 --- a/paddle/fluid/operators/eye_op.cc +++ b/paddle/fluid/operators/eye_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { @@ -82,14 +82,8 @@ Return an identity tensor whose shape is [num_rows, num_columns]. } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUDeviceContext; REGISTER_OPERATOR( eye, ops::EyeOp, ops::EyeOpMaker, ops::EyeOpVarTypeInference, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); - -REGISTER_OP_CPU_KERNEL(eye, ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, ops::EyeKernel, - ops::EyeKernel); diff --git a/paddle/fluid/operators/eye_op.cu b/paddle/fluid/operators/eye_op.cu deleted file mode 100644 index 8d55235a54c70..0000000000000 --- a/paddle/fluid/operators/eye_op.cu +++ /dev/null @@ -1,24 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/eye_op.h" - -namespace ops = paddle::operators; -namespace plf = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - eye, ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel, - ops::EyeKernel); diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h deleted file mode 100644 index 4cec5387e82aa..0000000000000 --- a/paddle/fluid/operators/eye_op.h +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/for_range.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -struct EyeFunctor { - EyeFunctor(int64_t num_columns, T* output) - : num_columns_(num_columns), output_(output) {} - - HOSTDEVICE void operator()(size_t idx) const { - output_[idx * num_columns_ + idx] = static_cast(1); - } - - int64_t num_columns_; - T* output_; -}; - -template -class EyeKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto num_rows = ctx.Attr("num_rows"); - auto num_columns = ctx.Attr("num_columns"); - if (num_columns == -1) num_columns = num_rows; - - auto* out_tensor = ctx.Output("Out"); - T* out_data = out_tensor->mutable_data(ctx.GetPlace()); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, out_tensor, static_cast(0)); - - int64_t num_eyes = (std::min)(num_rows, num_columns); - platform::ForRange for_range(dev_ctx, num_eyes); - EyeFunctor functor(num_columns, out_data); - for_range(functor); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc index e109e9d02a03a..5ee3202af135b 100644 --- a/paddle/fluid/operators/eye_op_npu.cc +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/pixel_shuffle_op.cc b/paddle/fluid/operators/pixel_shuffle_op.cc index cb9bbe727de5c..f5c53ab320db7 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.cc +++ b/paddle/fluid/operators/pixel_shuffle_op.cc @@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/pixel_shuffle_op.h" #include +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -177,16 +177,6 @@ REGISTER_OPERATOR(pixel_shuffle, ops::PixelShuffleOp, ops::PixelShuffleOpMaker, REGISTER_OPERATOR(pixel_shuffle_grad, ops::PixelShuffleGradOp); -REGISTER_OP_CPU_KERNEL( - pixel_shuffle, - ops::PixelShuffleOpKernel, - ops::PixelShuffleOpKernel); - -REGISTER_OP_CPU_KERNEL( - pixel_shuffle_grad, - ops::PixelShuffleGradOpKernel, - ops::PixelShuffleGradOpKernel); - REGISTER_OP_VERSION(pixel_shuffle) .AddCheckpoint( R"ROC( diff --git a/paddle/fluid/operators/pixel_shuffle_op.cu b/paddle/fluid/operators/pixel_shuffle_op.cu deleted file mode 100644 index 6faf91079e1da..0000000000000 --- a/paddle/fluid/operators/pixel_shuffle_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/pixel_shuffle_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - pixel_shuffle, ops::PixelShuffleOpKernel, - ops::PixelShuffleOpKernel); -REGISTER_OP_CUDA_KERNEL( - pixel_shuffle_grad, - ops::PixelShuffleGradOpKernel, - ops::PixelShuffleGradOpKernel); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h deleted file mode 100644 index 615bc97721674..0000000000000 --- a/paddle/fluid/operators/pixel_shuffle_op.h +++ /dev/null @@ -1,106 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class PixelShuffleOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - out->mutable_data(ctx.GetPlace()); - - int factor = ctx.Attr("upscale_factor"); - - std::string data_format = ctx.Attr("data_format"); - bool channel_last = (data_format == "NHWC"); - - auto in_dims = in->dims(); - auto o_dims = out->dims(); - - framework::Tensor t; - t.ShareDataWith(*in); - if (!channel_last) { - t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); - } else { - t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor}); - } - std::vector axis = {0, 1, 4, 2, 5, 3}; - - framework::Tensor o; - o.ShareDataWith(*out); - if (!channel_last) { - o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); - } else { - o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); - } - phi::funcs::Transpose trans; - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, t, &o, axis); - out->Resize(o_dims); - } -}; - -template -class PixelShuffleGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - dx->mutable_data(ctx.GetPlace()); - - int factor = ctx.Attr("upscale_factor"); - - std::string data_format = ctx.Attr("data_format"); - bool channel_last = (data_format == "NHWC"); - - auto do_dims = dout->dims(); - auto dx_dims = dx->dims(); - - framework::Tensor t; - t.ShareDataWith(*dout); - if (!channel_last) { - t.Resize( - {do_dims[0], do_dims[1], dx_dims[2], factor, 
dx_dims[3], factor}); - } else { - t.Resize( - {do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]}); - } - std::vector axis = {0, 1, 3, 5, 2, 4}; - - framework::Tensor o; - o.ShareDataWith(*dx); - if (!channel_last) { - o.Resize( - {do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); - } else { - o.Resize( - {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); - } - phi::funcs::Transpose trans; - auto& dev_ctx = ctx.template device_context(); - trans(dev_ctx, t, &o, axis); - dx->Resize(dx_dims); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/size_op.cc b/paddle/fluid/operators/size_op.cc index 70733d643673a..c27a02e46f332 100644 --- a/paddle/fluid/operators/size_op.cc +++ b/paddle/fluid/operators/size_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/size_op.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { @@ -53,7 +52,3 @@ REGISTER_OPERATOR( size, ops::SizeOp, ops::SizeOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker); -REGISTER_OP_CPU_KERNEL(size, ops::SizeKernel, ops::SizeKernel, - ops::SizeKernel, - ops::SizeKernel, ops::SizeKernel, - ops::SizeKernel); diff --git a/paddle/fluid/operators/size_op.cu b/paddle/fluid/operators/size_op.cu deleted file mode 100644 index de56ecd952705..0000000000000 --- a/paddle/fluid/operators/size_op.cu +++ /dev/null @@ -1,22 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/size_op.h" - -REGISTER_OP_CUDA_KERNEL( - size, paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, - paddle::operators::SizeKernel, paddle::operators::SizeKernel, - paddle::operators::SizeKernel); diff --git a/paddle/fluid/operators/size_op.h b/paddle/fluid/operators/size_op.h deleted file mode 100644 index 8840fde287d66..0000000000000 --- a/paddle/fluid/operators/size_op.h +++ /dev/null @@ -1,44 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; - -template -class SizeKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("Input"); - auto* out_t = ctx.Output("Out"); - auto place = ctx.GetPlace(); - auto out_data = out_t->mutable_data(place); - auto cpu_place = platform::CPUPlace(); - if (place == cpu_place) { - out_data[0] = in_t->numel(); - } else { - Tensor cpu_tensor; - auto cpu_data = - cpu_tensor.mutable_data(out_t->dims(), cpu_place); - cpu_data[0] = in_t->numel(); - paddle::framework::TensorCopy(cpu_tensor, place, out_t); - } - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/size_op_npu.cc b/paddle/fluid/operators/size_op_npu.cc index 5826d2b4a8742..95b97025f2969 100644 --- a/paddle/fluid/operators/size_op_npu.cc +++ b/paddle/fluid/operators/size_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc new file mode 100644 index 0000000000000..b1fe4f026ab07 --- /dev/null +++ b/paddle/phi/kernels/cpu/erfinv_grad_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/erfinv_grad_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + erfinv_grad, CPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/erfinv_kernel.cc b/paddle/phi/kernels/cpu/erfinv_kernel.cc new file mode 100644 index 0000000000000..4f3a740f9d9be --- /dev/null +++ b/paddle/phi/kernels/cpu/erfinv_kernel.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/erfinv_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(erfinv, CPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/eye_kernel.cc b/paddle/phi/kernels/cpu/eye_kernel.cc new file mode 100644 index 0000000000000..a0d0f2c439096 --- /dev/null +++ b/paddle/phi/kernels/cpu/eye_kernel.cc @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/eye_kernel.h" +#include "paddle/phi/kernels/impl/eye_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(eye, + CPU, + ALL_LAYOUT, + phi::EyeKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc new file mode 100644 index 0000000000000..b32065d4f0a14 --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_shuffle_grad_kernel.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_shuffle_grad, + CPU, + ALL_LAYOUT, + phi::PixelShuffleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc new file mode 100644 index 0000000000000..80f8fa7b50efb --- /dev/null +++ b/paddle/phi/kernels/cpu/pixel_shuffle_kernel.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pixel_shuffle_kernel.h" +#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pixel_shuffle, CPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} diff --git a/paddle/phi/kernels/cpu/size_kernel.cc b/paddle/phi/kernels/cpu/size_kernel.cc new file mode 100644 index 0000000000000..ff34ef26f6bd3 --- /dev/null +++ b/paddle/phi/kernels/cpu/size_kernel.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/size_kernel.h" +#include "paddle/phi/kernels/impl/size_kernel_impl.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(size, + CPU, + ALL_LAYOUT, + phi::SizeKernel, + int, + int64_t, + phi::dtype::float16, + float, + double, + bool) {} diff --git a/paddle/phi/kernels/erfinv_grad_kernel.h b/paddle/phi/kernels/erfinv_grad_kernel.h new file mode 100644 index 0000000000000..67e70ad38caf4 --- /dev/null +++ b/paddle/phi/kernels/erfinv_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void ErfinvGradKernel(const Context& ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/erfinv_kernel.h b/paddle/phi/kernels/erfinv_kernel.h new file mode 100644 index 0000000000000..8380a62971ba4 --- /dev/null +++ b/paddle/phi/kernels/erfinv_kernel.h @@ -0,0 +1,24 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/eye_kernel.h b/paddle/phi/kernels/eye_kernel.h
new file mode 100644
index 0000000000000..8b21b8ae40562
--- /dev/null
+++ b/paddle/phi/kernels/eye_kernel.h
@@ -0,0 +1,28 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void EyeKernel(const Context& ctx,
+               int64_t num_rows,
+               int64_t num_columns,
+               int dtype,
+               DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu
new file mode 100644
index 0000000000000..50fbfddf0432e
--- /dev/null
+++ b/paddle/phi/kernels/gpu/erfinv_grad_kernel.cu
@@ -0,0 +1,22 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/erfinv_grad_kernel.h"
+#include "paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_REGISTER_KERNEL(
+    erfinv_grad, GPU, ALL_LAYOUT, phi::ErfinvGradKernel, float, double) {}
diff --git a/paddle/phi/kernels/gpu/erfinv_kernel.cu b/paddle/phi/kernels/gpu/erfinv_kernel.cu
new file mode 100644
index 0000000000000..10df0bdf5603c
--- /dev/null
+++ b/paddle/phi/kernels/gpu/erfinv_kernel.cu
@@ -0,0 +1,21 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/kernels/erfinv_kernel.h" +#include "paddle/phi/kernels/impl/erfinv_kernel_impl.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(erfinv, GPU, ALL_LAYOUT, phi::ErfinvKernel, float, double) {} diff --git a/paddle/fluid/operators/erfinv_op.cu b/paddle/phi/kernels/gpu/eye_kernel.cu similarity index 52% rename from paddle/fluid/operators/erfinv_op.cu rename to paddle/phi/kernels/gpu/eye_kernel.cu index 1fb2dbb97a2df..069310b0d1562 100644 --- a/paddle/fluid/operators/erfinv_op.cu +++ b/paddle/phi/kernels/gpu/eye_kernel.cu @@ -12,17 +12,18 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/erfinv_op.h" +#include "paddle/phi/kernels/eye_kernel.h" +#include "paddle/phi/kernels/impl/eye_kernel_impl.h" -REGISTER_OP_CUDA_KERNEL( - erfinv, - paddle::operators::ErfinvKernel, - paddle::operators::ErfinvKernel); +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" -REGISTER_OP_CUDA_KERNEL( - erfinv_grad, - paddle::operators::ErfinvGradKernel, - paddle::operators::ErfinvGradKernel); +PD_REGISTER_KERNEL(eye, + GPU, + ALL_LAYOUT, + phi::EyeKernel, + float, + double, + int64_t, + int, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu new file mode 100644 index 0000000000000..6b82cbc67485b --- /dev/null +++ b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h" +#include "paddle/phi/kernels/pixel_shuffle_grad_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(pixel_shuffle_grad, + GPU, + ALL_LAYOUT, + phi::PixelShuffleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu new file mode 100644 index 0000000000000..25b240c6c1a3b --- /dev/null +++ b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu @@ -0,0 +1,22 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" +#include "paddle/phi/kernels/pixel_shuffle_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL( + pixel_shuffle, GPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/size_kernel.cu b/paddle/phi/kernels/gpu/size_kernel.cu new file mode 100644 index 0000000000000..17a39944eb04f --- /dev/null +++ b/paddle/phi/kernels/gpu/size_kernel.cu @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/impl/size_kernel_impl.h" +#include "paddle/phi/kernels/size_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(size, + GPU, + ALL_LAYOUT, + phi::SizeKernel, + int, + int64_t, + phi::dtype::float16, + float, + double, + bool) {} diff --git a/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h new file mode 100644 index 0000000000000..ae76574e04e71 --- /dev/null +++ b/paddle/phi/kernels/impl/erfinv_grad_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ErfinvGradKernel(const Context& ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + ctx.template Alloc(x_grad); + auto eigen_out = EigenVector::Flatten(out); + auto eigen_dout = EigenVector::Flatten(out_grad); + auto eigen_dx = EigenVector::Flatten(*x_grad); + auto& place = *ctx.eigen_device(); + constexpr T half_sqrt_pi = static_cast(1 / M_2_SQRTPI); + eigen_dx.device(place) = half_sqrt_pi * eigen_dout * eigen_out.square().exp(); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/erfinv_kernel_impl.h b/paddle/phi/kernels/impl/erfinv_kernel_impl.h new file mode 100644 index 0000000000000..c0fb8a01b9971 --- /dev/null +++ b/paddle/phi/kernels/impl/erfinv_kernel_impl.h @@ -0,0 +1,35 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifndef _USE_MATH_DEFINES +#define _USE_MATH_DEFINES // use M_2_SQRTPI on Windows +#endif +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void ErfinvKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { + ctx.template Alloc(out); + auto eigen_in = EigenVector::Flatten(x); + auto eigen_out = EigenVector::Flatten(*out); + auto& place = *ctx.eigen_device(); + constexpr T half = static_cast(0.5); + constexpr T half_sqrt = static_cast(M_SQRT1_2); + eigen_out.device(place) = (eigen_in * half + half).ndtri() * half_sqrt; +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/eye_kernel_impl.h b/paddle/phi/kernels/impl/eye_kernel_impl.h new file mode 100644 index 0000000000000..453652273a25b --- /dev/null +++ b/paddle/phi/kernels/impl/eye_kernel_impl.h @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct EyeFunctor { + EyeFunctor(int64_t num_columns, T* output) + : num_columns_(num_columns), output_(output) {} + + HOSTDEVICE void operator()(size_t idx) const { + output_[idx * num_columns_ + idx] = static_cast(1); + } + + int64_t num_columns_; + T* output_; +}; + +template +void EyeKernel(const Context& ctx, + int64_t num_rows, + int64_t num_columns, + int dtype, + DenseTensor* out) { + auto num = num_columns; + if (num == -1) { + num = num_rows; + } + T* out_data = ctx.template Alloc(out); + phi::funcs::SetConstant set_zero; + set_zero(ctx, out, static_cast(0)); + int64_t num_eyes = (std::min)(num_rows, num); + paddle::platform::ForRange for_range(ctx, num_eyes); + EyeFunctor functor(num, out_data); + for_range(functor); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h new file mode 100644 index 0000000000000..db19a04337932 --- /dev/null +++ b/paddle/phi/kernels/impl/pixel_shuffle_grad_kernel_impl.h @@ -0,0 +1,57 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PixelShuffleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + int upscale_factor, + const std::string& data_format, + DenseTensor* x_grad) { + auto* dout = &out_grad; + auto* dx = x_grad; + ctx.template Alloc(dx); + int factor = upscale_factor; + bool channel_last = (data_format == "NHWC"); + auto do_dims = dout->dims(); + auto dx_dims = dx->dims(); + + DenseTensor t(*dout); + if (!channel_last) { + t.Resize({do_dims[0], do_dims[1], dx_dims[2], factor, dx_dims[3], factor}); + } else { + t.Resize({do_dims[0], dx_dims[1], factor, dx_dims[2], factor, do_dims[3]}); + } + std::vector axis = {0, 1, 3, 5, 2, 4}; + + DenseTensor o(*dx); + if (!channel_last) { + o.Resize({do_dims[0], do_dims[1], factor, factor, dx_dims[2], dx_dims[3]}); + } else { + o.Resize({do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); + } + phi::funcs::Transpose trans; + trans(ctx, t, &o, axis); + dx->Resize(dx_dims); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h new file mode 100644 index 0000000000000..2303db4ea57d6 --- /dev/null +++ b/paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h @@ -0,0 +1,56 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +void PixelShuffleKernel(const Context& ctx, + const DenseTensor& x, + int upscale_factor, + const std::string& data_format, + DenseTensor* out) { + auto* in = &x; + ctx.template Alloc(out); + int factor = upscale_factor; + bool channel_last = (data_format == "NHWC"); + auto in_dims = in->dims(); + auto o_dims = out->dims(); + + DenseTensor t(*in); + if (!channel_last) { + t.Resize({in_dims[0], o_dims[1], factor, factor, in_dims[2], in_dims[3]}); + } else { + t.Resize({in_dims[0], in_dims[1], in_dims[2], o_dims[3], factor, factor}); + } + std::vector axis = {0, 1, 4, 2, 5, 3}; + + DenseTensor o(*out); + if (!channel_last) { + o.Resize({in_dims[0], o_dims[1], in_dims[2], factor, in_dims[3], factor}); + } else { + o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); + } + phi::funcs::Transpose trans; + trans(ctx, t, &o, axis); + out->Resize(o_dims); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/size_kernel_impl.h b/paddle/phi/kernels/impl/size_kernel_impl.h new file mode 100644 index 0000000000000..9a873871d75fd --- /dev/null +++ b/paddle/phi/kernels/impl/size_kernel_impl.h @@ -0,0 +1,39 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/copy_kernel.h" + +namespace phi { + +template +void SizeKernel(const Context& ctx, + const DenseTensor& input, + DenseTensor* out) { + auto place = ctx.GetPlace(); + auto out_data = ctx.template Alloc(out); + auto cpu_place = phi::CPUPlace(); + if (place == cpu_place) { + out_data[0] = input.numel(); + } else { + DenseTensor cpu_tensor; + cpu_tensor.Resize(out->dims()); + auto cpu_data = ctx.template HostAlloc(&cpu_tensor); + cpu_data[0] = input.numel(); + phi::Copy(ctx, cpu_tensor, false, out); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/pixel_shuffle_grad_kernel.h b/paddle/phi/kernels/pixel_shuffle_grad_kernel.h new file mode 100644 index 0000000000000..be57de5da4053 --- /dev/null +++ b/paddle/phi/kernels/pixel_shuffle_grad_kernel.h @@ -0,0 +1,29 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PixelShuffleGradKernel(const Context& ctx,
+                            const DenseTensor& out_grad,
+                            int upscale_factor,
+                            const std::string& data_format,
+                            DenseTensor* x_grad);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/pixel_shuffle_kernel.h b/paddle/phi/kernels/pixel_shuffle_kernel.h
new file mode 100644
index 0000000000000..18b9ab9c21fdc
--- /dev/null
+++ b/paddle/phi/kernels/pixel_shuffle_kernel.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void PixelShuffleKernel(const Context& ctx,
+                        const DenseTensor& x,
+                        int upscale_factor,
+                        const std::string& data_format,
+                        DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/size_kernel.h b/paddle/phi/kernels/size_kernel.h
new file mode 100644
index 0000000000000..2d7a29104db08
--- /dev/null
+++ b/paddle/phi/kernels/size_kernel.h
@@ -0,0 +1,24 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void SizeKernel(const Context& ctx, const DenseTensor& input, DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/erfinv_sig.cc b/paddle/phi/ops/compat/erfinv_sig.cc
new file mode 100644
index 0000000000000..490573191533f
--- /dev/null
+++ b/paddle/phi/ops/compat/erfinv_sig.cc
@@ -0,0 +1,26 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature ErfinvGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "erfinv_grad", {"Out", GradVarName("Out")}, {}, {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(erfinv_grad, phi::ErfinvGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/eye_sig.cc b/paddle/phi/ops/compat/eye_sig.cc new file mode 100644 index 0000000000000..6dafb642795d1 --- /dev/null +++ b/paddle/phi/ops/compat/eye_sig.cc @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature EyeOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature( + "eye", {}, {"num_rows", "num_columns", "dtype"}, {"Out"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(eye, phi::EyeOpArgumentMapping); diff --git a/paddle/phi/ops/compat/pixel_shuffle_sig.cc b/paddle/phi/ops/compat/pixel_shuffle_sig.cc new file mode 100644 index 0000000000000..641288cf12ae2 --- /dev/null +++ b/paddle/phi/ops/compat/pixel_shuffle_sig.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature PixelShuffleOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "pixel_shuffle", {"X"}, {"upscale_factor", "data_format"}, {"Out"}); +} + +KernelSignature PixelShuffleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("pixel_shuffle_grad", + {GradVarName("Out")}, + {"upscale_factor", "data_format"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle, phi::PixelShuffleOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(pixel_shuffle_grad, + phi::PixelShuffleGradOpArgumentMapping); From d32a01028b83a91ac0be421b0f0bad06dc993798 Mon Sep 17 00:00:00 2001 From: sneaxiy <32832641+sneaxiy@users.noreply.github.com> Date: Fri, 25 Feb 2022 18:57:54 +0800 Subject: [PATCH 61/85] Add MultiTensorApply to calculate L2-Norm in DistributedFusedLamb optimizer (#39900) * add multi tensor apply l2 norm * add multi_tensor_apply code * make sizeof(TensorMeta) smalller * move code to distributed_fused_lamb_op.cu * remove useless FLAGS --- .../distributed_fused_lamb_init_op.cu | 22 +- .../optimizers/distributed_fused_lamb_op.cc | 7 +- .../optimizers/distributed_fused_lamb_op.cu | 282 +++++++++++------- .../operators/optimizers/multi_tensor_apply.h | 156 ++++++++++ .../optimizer/distributed_fused_lamb.py | 4 +- 5 files changed, 355 insertions(+), 116 deletions(-) create mode 100644 paddle/fluid/operators/optimizers/multi_tensor_apply.h diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3bb605d7f553e..3445e9b658bec 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -284,6 +284,16 @@ static void CopyVectorToTensor(const std::vector &src, memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); } +template +static void CopyVectorToCPUTensor(const std::vector &src, + framework::Tensor *dst) { + dst->Resize({static_cast(src.size())}); + T *dst_ptr = dst->mutable_data(platform::CPUPlace()); + const T *src_ptr = src.data(); + auto nbytes = src.size() * sizeof(T); + std::memcpy(dst_ptr, src_ptr, nbytes); +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -677,14 +687,14 @@ class DistributedFusedLambInitOpKernel lengths.back()); } - CopyVectorToTensor( + CopyVectorToCPUTensor(numel_offsets, + ctx.Output("FusedParamOffsets")); + CopyVectorToCPUTensor( fp32_partial_numel_offsets, - ctx.Output("FP32ShardFusedParamOffsets"), place, - stream); - CopyVectorToTensor( + ctx.Output("FP32ShardFusedParamOffsets")); + CopyVectorToCPUTensor( fp16_partial_numel_offsets, - ctx.Output("FP16ShardFusedParamOffsets"), place, - stream); + ctx.Output("FP16ShardFusedParamOffsets")); // Fill the weight decay tensor PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index 748f8206adbc7..e5b27446eb330 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -33,12 +33,7 @@ class DistributedFusedLambOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, const 
framework::OpKernelType &expected_kernel_type) const override { - if (var_name == "ParamInfo") { - return expected_kernel_type; - } else { - return framework::OperatorWithKernel::GetKernelTypeForVar( - var_name, tensor, expected_kernel_type); - } + return expected_kernel_type; } }; diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index aeecea8a8e0c1..3f90140f77282 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -14,8 +14,10 @@ #include #include "paddle/fluid/memory/buffer.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/cast_with_ptr.h" #include "paddle/fluid/operators/optimizers/distributed_fused_lamb_op.h" +#include "paddle/fluid/operators/optimizers/multi_tensor_apply.h" #include "paddle/fluid/operators/tensor_to_string.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/collective_helper.h" @@ -40,6 +42,163 @@ namespace operators { template using MasterT = typename details::MPTypeTrait::Type; +template +static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { + static_assert(!std::is_same::value, "T cannot be void."); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); +#else + PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); +#endif +} + +template +struct L2NormFunctor { + DEVICE void operator()(int tensor_id, int chunk_id, int offset, int size, + const T *x, MasterT *y, int max_chunk_num) const { + using MT = MasterT; + const T *ptr = x + offset; + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage storage; + + MT square_sum = static_cast(0); + int i; + for (i = threadIdx.x * VecSize; i + VecSize <= size; + i += (BlockDim * VecSize)) { + platform::AlignedVector tmp_vec; + platform::Load(ptr + i, &tmp_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + auto tmp = static_cast(tmp_vec[j]); + square_sum += (tmp * tmp); + } + } + + for (; i < size; ++i) { + auto tmp = static_cast(ptr[i]); + square_sum += (tmp * tmp); + } + + square_sum = BlockReduce(storage).Reduce(square_sum, cub::Sum()); + if (threadIdx.x == 0) { + y[tensor_id * max_chunk_num + chunk_id] = square_sum; + } + } +}; + +template +static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( + const InT *x, OutT *y, int max_chunk_num) { + int tensor_id = blockIdx.x; + x += (tensor_id * max_chunk_num); + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage storage; + InT sum = static_cast(0); + for (int i = threadIdx.x; i < max_chunk_num; i += BlockDim) { + sum += x[i]; + } + sum = BlockReduce(storage).Reduce(sum, cub::Sum()); + if (threadIdx.x == 0) { + if (NeedSqrt) { + y[blockIdx.x] = static_cast(sqrtf(sum)); + } else { + y[blockIdx.x] = static_cast(sum); + } + } +} + +template +static int GetChunkedVecSize(const T *ptr, int chunk_size) { + static_assert(!std::is_same::value, "T cannot be void."); + + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + auto address = reinterpret_cast(ptr); + constexpr int vec8 = alignof(platform::AlignedVector); + constexpr int vec4 = alignof(platform::AlignedVector); + constexpr int vec2 = alignof(platform::AlignedVector); + if (address % vec8 == 0 && chunk_size % vec8 == 0) { + return std::min(8, 
valid_vec_size); + } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0 && chunk_size % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ + } + +#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) \ + do { \ + switch (__vec_size) { \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ + PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ + } \ + } while (0) + +// TODO(zengjinle): which chunk_size is better? +template +static void MultiTensorL2Norm(const platform::CUDAPlace &place, + gpuStream_t stream, const InT *x, + const int *offsets, int n, OutT *y, + int chunk_size = 65536) { + if (n <= 0) return; + + constexpr int kNumTensor = MaxTensorNumPerLaunch; + constexpr int kNumChunk = MaxChunkNumPerLaunch; + constexpr int kBlockDim = BlockDim; + + int max_chunk_num = -1; + int vec_size = 8; + int total_chunk_num = 0; + for (int i = 0; i < n; ++i) { + vec_size = std::min( + vec_size, GetChunkedVecSize(x + offsets[i] - offsets[0], chunk_size)); + int length = offsets[i + 1] - offsets[i]; + auto tmp_chunk_num = (length + chunk_size - 1) / chunk_size; + max_chunk_num = std::max(max_chunk_num, tmp_chunk_num); + total_chunk_num += tmp_chunk_num; + } + + VLOG(1) << "MultiTensorL2Norm max_chunk_num = " << max_chunk_num + << " , total_chunk_num = " << total_chunk_num + << " , tensor_num = " << n; + + using MT = MasterT; + memory::Buffer tmp_out(place); + auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); + FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); + +#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ + max_chunk_num); \ + } while (0) + + PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + + MultiTensorL2NormReduceAgainCUDAKernel<<>>( + tmp_out_ptr, y, max_chunk_num); +} + template static void LogParamAndTrustRatioDivSquareNorm( const framework::ExecutionContext &ctx, const float *param_square_norm, @@ -620,76 +779,6 @@ static void CubDeviceReduce(InputIteratorT d_in, OutputIteratorT d_out, num_items, reduction_op, init, stream)); } -template -static void CubDeviceSegmentedReduce(InputIteratorT d_in, OutputIteratorT d_out, - int num_segments, - OffsetIteratorT d_begin_offsets, - OffsetIteratorT d_end_offsets, - ReductionOp reduction_op, T initial_value, - gpuStream_t stream, - memory::Buffer *buffer) { - void *d_temp_storage = nullptr; - size_t temp_storage_bytes = 0; - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedReduce::Reduce( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, - d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream)); - d_temp_storage = buffer->Alloc(temp_storage_bytes); - PADDLE_ENFORCE_GPU_SUCCESS(cub::DeviceSegmentedReduce::Reduce( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, - d_begin_offsets, d_end_offsets, reduction_op, initial_value, stream)); -} - -template -struct AddConstantFunctor { - explicit AddConstantFunctor(T bias) : bias_(bias) {} - - T operator()(T x) const { 
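// [Editor's note - illustrative sketch, not part of this patch] MultiTensorL2Norm
// above computes per-tensor L2 norms in two passes: L2NormFunctor writes one
// partial sum of squares per (tensor, chunk) into an [n, max_chunk_num] buffer,
// and MultiTensorL2NormReduceAgainCUDAKernel then sums each tensor's row and
// takes the square root. A minimal sequential reference of the same scheme
// (names and types here are made up for illustration only):
#include <cmath>
#include <vector>

static std::vector<float> MultiTensorL2NormReference(
    const std::vector<std::vector<float>>& tensors, int chunk_size) {
  std::vector<float> norms(tensors.size(), 0.0f);
  for (size_t i = 0; i < tensors.size(); ++i) {
    const auto& t = tensors[i];
    size_t chunk_num = (t.size() + chunk_size - 1) / chunk_size;
    std::vector<float> partial(chunk_num, 0.0f);  // pass 1: per-chunk sums
    for (size_t j = 0; j < t.size(); ++j) {
      partial[j / chunk_size] += t[j] * t[j];
    }
    float sum = 0.0f;
    for (float p : partial) sum += p;             // pass 2: reduce-again
    norms[i] = std::sqrt(sum);
  }
  return norms;
}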
return x + bias_; } - - private: - T bias_; -}; - -template -struct OffsetWithBiasFunctor { - OffsetWithBiasFunctor(const T *offset, T bias) - : offset_(offset), bias_(bias) {} - - HOSTDEVICE T operator()(T idx) const { return offset_[idx] - bias_; } - - HOSTDEVICE constexpr bool operator==(const OffsetWithBiasFunctor &) const { - return true; - } - - private: - const T *offset_; - const T bias_; -}; - -template -static void CubDeviceSegmentedSquareNorm(const T *x, MasterT *y, int n, - const OffsetT *offset, - OffsetT init_offset, - gpuStream_t stream, - memory::Buffer *buffer) { - if (n <= 0) return; - cub::TransformInputIterator, SquareFunctor, const T *> iter( - x, SquareFunctor()); - if (init_offset == static_cast(0)) { - CubDeviceSegmentedReduce(iter, y, n, offset, offset + 1, cub::Sum(), - static_cast>(0), stream, buffer); - } else { - cub::CountingInputIterator cnt_iter(0); - OffsetWithBiasFunctor functor(offset, init_offset); - cub::TransformInputIterator, - cub::CountingInputIterator> - offset_iter(cnt_iter, functor); - CubDeviceSegmentedReduce(iter, y, n, offset_iter, offset_iter + 1, - cub::Sum(), static_cast>(0), stream, - buffer); - } -} - template static void GetSquareGradNormImpl(const T *grad, int n, float *square_norm, gpuStream_t stream, @@ -862,16 +951,6 @@ static void CheckHasNanInfGrad(const float *fp32_grad, int fp32_numel, } } -template -static void FillZeroWithPtr(T *x, size_t n, gpuStream_t stream) { - static_assert(!std::is_same::value, "T cannot be void."); -#ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS(hipMemsetAsync(x, 0, n * sizeof(T), stream)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(cudaMemsetAsync(x, 0, n * sizeof(T), stream)); -#endif -} - template class DistributedFusedLambOpKernel : public framework::OpKernel { @@ -1191,13 +1270,16 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets_t->data(); VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), place); + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); VLOG(1) << "FP32ShardFusedParamOffsets: " << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), place); + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); VLOG(1) << "FP16ShardFusedParamOffsets: " << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), place); + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); if (num_devices > 1) { if (use_master_param_norm) { @@ -1207,32 +1289,26 @@ class DistributedFusedLambOpKernel FillZeroWithPtr(trust_ratio_div_square_norm, param_num, stream); } } - CubDeviceSegmentedSquareNorm(fp32_param, param_square_norm, - fp32_global_param_num, fused_offsets, 0, - stream, &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, fp32_param, fused_offsets, + fp32_global_param_num, param_square_norm); if (use_master_param_norm) { - CubDeviceSegmentedSquareNorm( - master_param + fp16_offset, param_square_norm + fp16_local_start_idx, - fp16_local_param_num, fp16_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, master_param + fp16_offset, + fp16_partial_fused_offsets, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } else { // NOTE: extra computation is performed. We can improve this performance // if needed in the future. 
- CubDeviceSegmentedSquareNorm( - fp16_param, param_square_norm + fp32_global_param_num, - fp16_global_param_num, fused_offsets + fp32_global_param_num, - static_cast(fp32_numel), stream, &cub_tmp_buffer); + MultiTensorL2Norm( + place, stream, fp16_param, fused_offsets + fp32_global_param_num, + fp16_global_param_num, param_square_norm + fp32_global_param_num); } - CubDeviceSegmentedSquareNorm( - trust_ratio_div, trust_ratio_div_square_norm + fp32_local_start_idx, - fp32_local_param_num, fp32_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); - CubDeviceSegmentedSquareNorm( - trust_ratio_div + fp32_numel_each_device, - trust_ratio_div_square_norm + fp16_local_start_idx, - fp16_local_param_num, fp16_partial_fused_offsets, 0, stream, - &cub_tmp_buffer); + MultiTensorL2Norm(place, stream, trust_ratio_div, + fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div_square_norm + fp32_local_start_idx); + MultiTensorL2Norm(place, stream, trust_ratio_div + fp32_numel_each_device, + fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div_square_norm + fp16_local_start_idx); VLOG(1) << "TrustRatioDiv L2-Norm before allreduce: " << FlattenToString(trust_ratio_div_square_norm, param_num, place); diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h new file mode 100644 index 0000000000000..5d8d03c733dae --- /dev/null +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -0,0 +1,156 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "math.h" // NOLINT + +namespace paddle { +namespace operators { + +template +struct TensorMetaList { + static constexpr int kTensorNum = MaxTensorNumPerLaunch; + static constexpr int kChunkNum = MaxChunkNumPerLaunch; + + static_assert(kTensorNum > 0 && kTensorNum < 256, + "kTensorNum must be inside (0, 256)."); + static_assert(kChunkNum > 0 && kChunkNum < 65536, + "kChunkNum must be inside (0, 65536)."); + + /** + * The tensor numel offset of each tensor. + * The offsets[0] would be always 0 in the first launch, + * and then offsets[0] >= 0 in the following other launches. + * The numel of the i-th tensor would be offsets[i + 1] - offsets[i]. + */ + int offsets[kTensorNum + 1]; + + /** + * The tensor id of each chunk. The tensor_ids[0] is always 0. + * Note that tensor_ids would be always in the ascending order. + * The actual tensor id is start_tensor_id + tensor_ids[i]. + * + * The reason why we assume that the actual tensor id is + * start_tensor_id + tensor_ids[i] is to make tensor_ids to be + * a uint8_t array instead of an int array, making sizeof(TensorMetaList) + * smaller, so that kChunkNum can be larger. + */ + uint8_t tensor_ids[kChunkNum]; + + /** + * The chunk id of the chunk inside each tensor. It would be + * something like chunk_ids = [0, 1, 2, 0, 0, 1, 2, 3], meaning + * that there are 3 tensors and each tensor contains 3, 1 and 4 + * chunks. 
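 * [Editor's note, worked example not in the original patch] With
 * chunk_size = 65536 and three tensors of numel 150000, 50000 and 250000
 * (3, 1 and 4 chunks respectively), a single launch would carry
 *   offsets    = {0, 150000, 200000, 450000}
 *   tensor_ids = {0, 0, 0, 1, 2, 2, 2, 2}
 *   chunk_ids  = {0, 1, 2, 0, 0, 1, 2, 3}
 * so block b processes chunk chunk_ids[b] of tensor tensor_ids[b].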
Note that chunk_ids[0] is always 0 and the actual + * chunk id of the first tensor is always start_chunk_id + chunk_ids[i]. + * + * The reason why we assume that the actual chunk id of the first + * tensor is always start_chunk_id + chunk_ids[i] is to make + * chunk_ids to be a uint16_t array instead of an int array, making + * sizeof(TensorMetaList) smaller, so that kChunkNum can be larger. + */ + uint16_t chunk_ids[kChunkNum]; + + /** + * The tensor_ids offset. + */ + int start_tensor_id; + + /** + * The chunk_ids offset. + */ + int start_chunk_id; +}; + +template +static __global__ void MultiTensorApplyCUDAKernel( + Functor functor, + TensorMetaList meta, + int chunk_size, Args... args) { + const int block_id = blockIdx.x; + const int tensor_id = meta.tensor_ids[block_id]; + const int chunk_id = static_cast(meta.chunk_ids[block_id]) + + (tensor_id == 0) * meta.start_chunk_id; + const int prev_offset = meta.offsets[tensor_id]; + const int next_offset = meta.offsets[tensor_id + 1]; + const int ptr_offset = prev_offset + chunk_id * chunk_size; + const int size = min(next_offset - ptr_offset, chunk_size); + + functor(tensor_id + meta.start_tensor_id, chunk_id, ptr_offset, size, + args...); +} + +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + Args... args) { + if (n == 0) return; + + constexpr auto NumTensor = MaxTensorNumPerLaunch; + constexpr auto NumChunk = MaxChunkNumPerLaunch; + TensorMetaList metas; + + int tensor_id = 0; + int chunk_id = 0; + int numel_offset = 0; + metas.start_tensor_id = 0; + metas.start_chunk_id = 0; + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + if (tensor_id == 0) { + metas.start_tensor_id = i; + metas.offsets[0] = numel_offset; + } + metas.offsets[tensor_id + 1] = metas.offsets[tensor_id] + length; + ++tensor_id; + numel_offset += length; + + auto chunk_num = (length + chunk_size - 1) / chunk_size; + int last_launch_chunk_id = 0; + for (int j = 0; j < chunk_num; ++j) { + metas.chunk_ids[chunk_id] = j - last_launch_chunk_id; + metas.tensor_ids[chunk_id] = tensor_id - 1; + ++chunk_id; + + bool tensor_full = (tensor_id == NumTensor && j + 1 == chunk_num); + bool block_full = (chunk_id == NumChunk); + bool last_chunk = (i + 1 == n && j + 1 == chunk_num); + + if (tensor_full || block_full || last_chunk) { + MultiTensorApplyCUDAKernel<<>>( + functor, metas, chunk_size, args...); + chunk_id = 0; + if (j + 1 == chunk_num) { // chunk for the current tensor is full + metas.start_chunk_id = 0; + tensor_id = 0; + } else { + metas.offsets[0] = metas.offsets[tensor_id - 1]; + metas.offsets[1] = metas.offsets[tensor_id]; + metas.start_tensor_id = i; + metas.start_chunk_id = j + 1; + last_launch_chunk_id = j + 1; + tensor_id = 1; + } + } + } + } +} + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index 74c481fb641ac..e7c3cfbb7b93b 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -178,11 +178,13 @@ def _apply_gradients_impl(self, params_grads): param_info = self._create_persistable_var('param_info', dtype='int32') param_info.is_distributed = True - fused_offsets = self._create_persistable_var('fused_offsets') + fused_offsets = self._create_persistable_var( + 'fused_offsets', dtype='int32') fp32_partial_fused_offsets = self._create_persistable_var( 
'fp32_partial_fused_offsets', dtype='int32') fp32_partial_fused_offsets.is_distributed = True + fp16_partial_fused_offsets = self._create_persistable_var( 'fp16_partial_fused_offsets', dtype='int32') fp16_partial_fused_offsets.is_distributed = True From 584844ec25ea85dc657b70a86beaf2351c9d3147 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Fri, 25 Feb 2022 12:59:36 +0100 Subject: [PATCH 62/85] added logsoftmax oneDNN kernel (#39793) --- paddle/fluid/operators/log_softmax_op.cc | 18 +++- .../operators/mkldnn/log_softmax_mkldnn_op.cc | 78 ++++++++++++++++ .../inference/test_mkldnn_log_softmax_op.py | 63 +++++++++++++ .../mkldnn/test_log_softmax_mkldnn_op.py | 93 +++++++++++++++++++ .../fluid/tests/unittests/test_log_softmax.py | 2 +- 5 files changed, 250 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py diff --git a/paddle/fluid/operators/log_softmax_op.cc b/paddle/fluid/operators/log_softmax_op.cc index d6e2b3ecff8c8..0e69b397e04c7 100644 --- a/paddle/fluid/operators/log_softmax_op.cc +++ b/paddle/fluid/operators/log_softmax_op.cc @@ -31,9 +31,17 @@ class LogSoftmaxOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -48,6 +56,10 @@ class LogSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "The dimension index of Input(x) to perform log_softmax," "default -1 for last dimension") .SetDefault(-1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( LogSoftmax Operator. diff --git a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc new file mode 100644 index 0000000000000..450462e7d4bb9 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/softmax_op.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +template +class LogSoftmaxMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* x, + const int axis) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place) { + const auto logsoftmax_tz = phi::vectorize(x->dims()); + const auto md = dnnl::memory::desc( + logsoftmax_tz, platform::MKLDNNGetDataType(), x->format()); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_inference, + md, axis); + } +}; + +template +class LogSoftmaxMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* x = ctx.Input("X"); + Tensor* out = ctx.Output("Out"); + + int axis = ctx.Attr("axis"); + axis = axis >= 0 ? axis : x->dims().size() + axis; + + LogSoftmaxMKLDNNHandler handler(mkldnn_engine, ctx.GetPlace(), x, axis); + + auto src_memory_p = handler.AcquireSrcMemory(x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto logsoftmax_p = handler.AcquireForwardPrimitive(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + logsoftmax_p->execute(astream, {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(x->format()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(log_softmax, MKLDNN, ::paddle::platform::CPUPlace, + ops::LogSoftmaxMKLDNNKernel, + ops::LogSoftmaxMKLDNNKernel); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py new file mode 100644 index 0000000000000..3dc0623a112f5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_log_softmax_op.py @@ -0,0 +1,63 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from auto_scan_test import MkldnnAutoScanTest +from program_config import TensorConfig, ProgramConfig, OpConfig +import numpy as np +from functools import partial +import unittest +from hypothesis import given +import hypothesis.strategies as st + + +class TestMKLDNNLogSoftmaxOp(MkldnnAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self, *args, **kwargs): + def generate_input(*args, **kwargs): + return np.random.random(kwargs['in_shape']).astype(np.float32) + + logsoftmax_op = OpConfig( + type="log_softmax", + inputs={"X": ["input_data"]}, + outputs={"Out": ["output_data"]}, + attrs={"axis": kwargs['axis']}) + + program_config = ProgramConfig( + ops=[logsoftmax_op], + weights={}, + inputs={ + "input_data": TensorConfig(data_gen=partial(generate_input, + *args, **kwargs)), + }, + outputs=["output_data"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + config = self.create_inference_config(use_mkldnn=True) + yield config, (1e-5, 1e-5) + + @given( + axis=st.sampled_from([-2, -1, 0, 1]), + in_shape=st.lists( + st.integers( + min_value=2, max_value=5), min_size=3, max_size=5)) + def test(self, *args, **kwargs): + self.run_test(quant=False, *args, **kwargs) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py new file mode 100644 index 0000000000000..7477eaf3339b2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_log_softmax_mkldnn_op.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
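# [Editor's note, not part of this patch] A log-softmax along one axis can be
# written in the numerically stable form
#     log_softmax(x)_i = (x_i - m) - log(sum_j exp(x_j - m)),  with m = max(x).
# The ref_log_softmax reference imported below is applied slice-by-slice (via
# np.apply_along_axis) to produce the expected outputs; a minimal NumPy sketch
# of the formula itself (illustrative only, not the actual reference):

def _log_softmax_1d_sketch(x):
    import numpy as np
    shifted = x - np.max(x)
    return shifted - np.log(np.sum(np.exp(shifted)))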
+ +import unittest +import numpy as np +import paddle +from paddle.fluid import core +from paddle.fluid.tests.unittests.test_log_softmax import ref_log_softmax +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestLogSoftmaxOneDNNOp(OpTest): + def setUp(self): + self.op_type = 'log_softmax' + self.set_dtype() + self.set_shape() + self.set_axis() + + x = np.random.uniform(0.1, 1.0, self.shape).astype(np.float32) + out = np.apply_along_axis(ref_log_softmax, self.axis, x) + + if self.dtype == np.uint16: + x = convert_float_to_uint16(x) + + self.inputs = {'X': x} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis, 'use_mkldnn': True} + + def set_dtype(self): + self.dtype = np.float32 + + def set_shape(self): + self.shape = [2, 3, 4, 5] + + def set_axis(self): + self.axis = -1 + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + +class TestLogSoftmax1DOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_shape(self): + self.shape = [100] + + +class TestLogSoftmax3DOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_shape(self): + self.shape = [12, 10, 3] + + +class TestLogSoftmax5DOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_shape(self): + self.shape = [2, 3, 4, 5, 6] + + +class TestLogSoftmaxPositiveAxisOneDNNOp(TestLogSoftmaxOneDNNOp): + def set_axis(self): + self.axis = 2 + + +# BF16 TESTS +class TestLogSoftmax1DBF16OneDNNOp(TestLogSoftmax1DOneDNNOp): + def set_dtype(self): + self.dtype = np.uint16 + + +class TestLogSoftmaxPositiveAxisBF16OneDNNOp( + TestLogSoftmaxPositiveAxisOneDNNOp): + def set_dtype(self): + self.dtype = np.uint16 + + +class TestLogSoftmax5DBF16OneDNNOp(TestLogSoftmax5DOneDNNOp): + def set_shape(self): + self.shape = [2, 3, 4, 5, 6] + + +if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_log_softmax.py b/python/paddle/fluid/tests/unittests/test_log_softmax.py index 0dd6c9f893e2a..d1437ca9c96f1 100644 --- a/python/paddle/fluid/tests/unittests/test_log_softmax.py +++ b/python/paddle/fluid/tests/unittests/test_log_softmax.py @@ -14,7 +14,7 @@ import unittest import numpy as np -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest import paddle import paddle.nn.functional as F From 687902fcfd89c7648ab68d6e57041cfb7ae20fc8 Mon Sep 17 00:00:00 2001 From: Feiyu Chan Date: Fri, 25 Feb 2022 22:45:32 +0800 Subject: [PATCH 63/85] [phi] update code for mkl based fft (#39889) --- paddle/fluid/operators/spectral_op.cc | 75 ++++++++++++++------------- paddle/fluid/platform/dynload/mklrt.h | 3 +- 2 files changed, 40 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index fe76448a185c9..db3dc214bfe7a 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -25,9 +25,10 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/phi/kernels/funcs/complex_functors.h" #if defined(PADDLE_WITH_ONEMKL) -#include "paddle/fluid/platform/dynload/mklrt.h" +#include "paddle/phi/backends/dynload/mklrt.h" #elif defined(PADDLE_WITH_POCKETFFT) #include "extern_pocketfft/pocketfft_hdronly.h" #endif @@ -357,12 +358,12 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { // FFT Functors #if defined(PADDLE_WITH_ONEMKL) -#define MKL_DFTI_CHECK(expr) \ - do { \ - 
MKL_LONG status = (expr); \ - if (!platform::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ - PADDLE_THROW(platform::errors::External( \ - platform::dynload::DftiErrorMessage(status))); \ +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!phi::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW( \ + platform::errors::External(phi::dynload::DftiErrorMessage(status))); \ } while (0); namespace { @@ -370,7 +371,7 @@ namespace { struct DftiDescriptorDeleter { void operator()(DFTI_DESCRIPTOR_HANDLE handle) { if (handle != nullptr) { - MKL_DFTI_CHECK(platform::dynload::DftiFreeDescriptor(&handle)); + MKL_DFTI_CHECK(phi::dynload::DftiFreeDescriptor(&handle)); } } }; @@ -385,7 +386,7 @@ class DftiDescriptor { "DftiDescriptor has already been initialized.")); DFTI_DESCRIPTOR* raw_desc; - MKL_DFTI_CHECK(platform::dynload::DftiCreateDescriptorX( + MKL_DFTI_CHECK(phi::dynload::DftiCreateDescriptorX( &raw_desc, precision, signal_type, signal_ndim, sizes)); desc_.reset(raw_desc); } @@ -437,21 +438,21 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); // placement inplace or not inplace - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( - descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), DFTI_PLACEMENT, + DFTI_NOT_INPLACE)); // number of transformations const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); // input & output distance const MKL_LONG idist = in_strides[0]; const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - DFTI_OUTPUT_DISTANCE, odist)); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(phi::dynload::DftiSetValue(descriptor.get(), + DFTI_OUTPUT_DISTANCE, odist)); // input & output stride std::vector mkl_in_stride(1 + signal_ndim, 0); @@ -460,14 +461,14 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, mkl_in_stride[i] = in_strides[i]; mkl_out_stride[i] = out_strides[i]; } - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); // conjugate even storage if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + MKL_DFTI_CHECK(phi::dynload::DftiSetValue( descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } @@ -489,12 +490,12 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, return DFTI_BACKWARD_SCALE; } }(); - MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), - scale_direction, scale)); + MKL_DFTI_CHECK( + phi::dynload::DftiSetValue(descriptor.get(), scale_direction, scale)); } // commit the descriptor - MKL_DFTI_CHECK(platform::dynload::DftiCommitDescriptor(descriptor.get())); + MKL_DFTI_CHECK(phi::dynload::DftiCommitDescriptor(descriptor.get())); return descriptor; } @@ -575,39 +576,39 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, 
Tensor* out, framework::TransToProtoVarType(out->dtype()), input_stride, output_stride, signal_sizes, normalization, forward); - const FFTTransformType fft_type = GetFFTTransformType(x->type(), out->type()); + const FFTTransformType fft_type = + GetFFTTransformType(framework::TransToProtoVarType(x->dtype()), + framework::TransToProtoVarType(out->type())); if (fft_type == FFTTransformType::C2R && forward) { - framework::Tensor collapsed_input_conj( - framework::TransToProtoVarType(collapsed_input.dtype())); + framework::Tensor collapsed_input_conj(collapsed_input.dtype()); collapsed_input_conj.mutable_data(collapsed_input.dims(), ctx.GetPlace()); // conjugate the input platform::ForRange for_range(ctx, collapsed_input.numel()); - math::ConjFunctor functor(collapsed_input.data(), - collapsed_input.numel(), - collapsed_input_conj.data()); + phi::funcs::ConjFunctor functor(collapsed_input.data(), + collapsed_input.numel(), + collapsed_input_conj.data()); for_range(functor); - MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( desc.get(), collapsed_input_conj.data(), collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { - framework::Tensor collapsed_output_conj( - framework::TransToProtoVarType(collapsed_output.dtype())); + framework::Tensor collapsed_output_conj(collapsed_output.dtype()); collapsed_output_conj.mutable_data(collapsed_output.dims(), ctx.GetPlace()); - MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output_conj.data())); // conjugate the output platform::ForRange for_range(ctx, collapsed_output.numel()); - math::ConjFunctor functor(collapsed_output_conj.data(), - collapsed_output.numel(), - collapsed_output.data()); + phi::funcs::ConjFunctor functor(collapsed_output_conj.data(), + collapsed_output.numel(), + collapsed_output.data()); for_range(functor); } else { if (forward) { - MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeForward( desc.get(), collapsed_input.data(), collapsed_output.data())); } else { - MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + MKL_DFTI_CHECK(phi::dynload::DftiComputeBackward( desc.get(), collapsed_input.data(), collapsed_output.data())); } } diff --git a/paddle/fluid/platform/dynload/mklrt.h b/paddle/fluid/platform/dynload/mklrt.h index 3b7d23277e065..334b98a1c3d5a 100644 --- a/paddle/fluid/platform/dynload/mklrt.h +++ b/paddle/fluid/platform/dynload/mklrt.h @@ -17,7 +17,8 @@ limitations under the License. 
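// [Editor's note, not part of this patch] The conjugation steps in exec_fft
// above (spectral_op.cc) rely on the identity DFT(x) = conj(IDFT(conj(x)))
// for the un-normalized transforms, since
//   DFT(x)_k  = sum_n x_n * exp(-2*pi*i*k*n/N)  and
//   IDFT(y)_n = sum_k y_k * exp(+2*pi*i*k*n/N).
// Hence a forward C2R transform is computed by conjugating the complex input
// and calling DftiComputeBackward (the real output needs no conjugation), and
// a backward R2C transform by calling DftiComputeForward on the real input
// (conjugating a real signal is a no-op) and conjugating the complex output.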
*/ #include #include // NOLINT -#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/dynamic_loader.h" +#include "paddle/phi/backends/dynload/mklrt.h" #include "paddle/phi/backends/dynload/port.h" namespace paddle { From 94d8f39284bb30f837218d3d21605f030dd4f3fc Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 25 Feb 2022 23:24:37 +0800 Subject: [PATCH 64/85] move for_range into phi (#39931) --- paddle/fluid/platform/for_range.h | 127 +---------------- paddle/phi/kernels/cpu/abs_kernel.cc | 4 +- paddle/phi/kernels/funcs/diagonal.h | 4 +- paddle/phi/kernels/funcs/elementwise_base.h | 4 +- paddle/phi/kernels/funcs/for_range.h | 129 ++++++++++++++++++ paddle/phi/kernels/gpu/poisson_kernel.cu | 4 +- .../phi/kernels/impl/abs_grad_kernel_impl.h | 6 +- .../phi/kernels/impl/atan2_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/atan2_kernel_impl.h | 4 +- .../kernels/impl/complex_grad_kernel_impl.h | 6 +- paddle/phi/kernels/impl/complex_kernel_impl.h | 8 +- .../kernels/impl/digamma_grad_kernel_impl.h | 4 +- paddle/phi/kernels/impl/digamma_kernel_impl.h | 4 +- .../phi/kernels/impl/trace_grad_kernel_impl.h | 4 +- 14 files changed, 160 insertions(+), 152 deletions(-) create mode 100644 paddle/phi/kernels/funcs/for_range.h diff --git a/paddle/fluid/platform/for_range.h b/paddle/fluid/platform/for_range.h index f3f7064efeeb2..abc427a3ca881 100644 --- a/paddle/fluid/platform/for_range.h +++ b/paddle/fluid/platform/for_range.h @@ -13,136 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" + #include "paddle/fluid/platform/device_context.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace paddle { namespace platform { template -struct ForRange { - ForRange(const DeviceContext& dev_ctx, size_t limit); - - template - void operator()(Function func) const; -}; - -// NOTE: After the pten kernel is migrated, it needs to be deleted. -template <> -struct ForRange { - ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} - - template - void operator()(Function func) const { - for (size_t i = 0; i < limit_; ++i) { - func(i); - } - } - - size_t limit_; -}; - -template <> -struct ForRange { - ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {} - - template - void operator()(Function func) const { - for (size_t i = 0; i < limit_; ++i) { - func(i); - } - } - - size_t limit_; -}; - -#if defined(__NVCC__) || defined(__HIPCC__) -template -__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { - size_t idx = static_cast(threadIdx.x); - func(idx); -} - -template -__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { - size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - if (idx < limit) { - func(idx); - } -} - -// NOTE: After the pten kernel is migrated, it needs to be deleted. 
-template <> -struct ForRange { - ForRange(const CUDADeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} - - template - inline void operator()(Function func) const { -#ifdef __HIPCC__ - // HIP will throw core dump when threads > 256 - constexpr int num_threads = 256; -#elif WITH_NV_JETSON - // JETSON_NANO will throw core dump when threads > 128 - int num_thread = 256; - platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); - const int num_threads = num_thread; -#else - constexpr int num_threads = 1024; -#endif - size_t block_size = limit_ <= num_threads ? limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } - } - - const CUDADeviceContext& dev_ctx_; - size_t limit_; -}; - -template <> -struct ForRange { - ForRange(const phi::GPUContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} - - template - inline void operator()(Function func) const { -#ifdef __HIPCC__ - // HIP will throw core dump when threads > 256 - constexpr int num_threads = 256; -#elif WITH_NV_JETSON - // JETSON_NANO will throw core dump when threads > 128 - int num_thread = 256; - platform::ChangeThreadNum(dev_ctx_, &num_thread, 128); - const int num_threads = num_thread; -#else - constexpr int num_threads = 1024; -#endif - size_t block_size = limit_ <= num_threads ? limit_ : num_threads; - size_t grid_size = (limit_ + num_threads - 1) / num_threads; - - if (grid_size == 1) { - ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( - func); - } else { - ForRangeElemwiseOp<<>>( - func, limit_); - } - } - - const phi::GPUContext& dev_ctx_; - size_t limit_; -}; - -#endif +using ForRange = phi::funcs::ForRange; } // namespace platform } // namespace paddle diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index 71d818c45e6f3..efe7d090405df 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. 
#include "paddle/phi/kernels/abs_kernel.h" -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -29,7 +29,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { out, size_t(x.numel() * sizeof(phi::funcs::Real))); auto* out_data = out->data>(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/funcs/diagonal.h b/paddle/phi/kernels/funcs/diagonal.h index a82c4f66d0102..19a93970d090a 100644 --- a/paddle/phi/kernels/funcs/diagonal.h +++ b/paddle/phi/kernels/funcs/diagonal.h @@ -22,8 +22,8 @@ #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { namespace funcs { @@ -118,7 +118,7 @@ DenseTensor Diagonal(const DeviceContext& context, #endif // auto& dev_ctx = context.template device_context(); - paddle::platform::ForRange for_range(context, diag.numel()); + phi::funcs::ForRange for_range(context, diag.numel()); DiagonalFunctor functor( input_data, diag_arr, ret_arr, pos, dim_size, diag_data); for_range(functor); diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h index 47f1593a11eb9..d369781f845eb 100644 --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -14,11 +14,11 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/phi/backends/all_context.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) @@ -418,7 +418,7 @@ void ElemwiseGradComputeNoBroadcast(const DeviceContext &dev_ctx, DX_OP dx_op, DY_OP dy_op) { size_t N = static_cast(phi::product(x_dim)); - paddle::platform::ForRange for_range(dev_ctx, N); + phi::funcs::ForRange for_range(dev_ctx, N); for_range(ElemwiseGradNoBroadcast{ x.data(), y.data(), diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h new file mode 100644 index 0000000000000..bf0888c301fe7 --- /dev/null +++ b/paddle/phi/kernels/funcs/for_range.h @@ -0,0 +1,129 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" + +namespace phi { +namespace funcs { + +template +struct ForRange { + ForRange(const Context& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const phi::CPUContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +// NOTE: After the pten kernel is migrated, it needs to be deleted. +template <> +struct ForRange { + ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(limit) {} + + template + void operator()(Function func) const { + phi::funcs::ForRange for_range(dev_ctx_, limit_); + for_range(func); + } + + const paddle::platform::CPUDeviceContext& dev_ctx_; + size_t limit_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, size_t limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const phi::GPUContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(limit) {} + + template + inline void operator()(Function func) const { +#ifdef __HIPCC__ + // HIP will throw core dump when threads > 256 + constexpr int num_threads = 256; +#elif WITH_NV_JETSON + // JETSON_NANO will throw core dump when threads > 128 + int num_thread = 256; + backends::gpu::ChangeThreadNum(dev_ctx_, &num_thread, 128); + const int num_threads = num_thread; +#else + constexpr int num_threads = 1024; +#endif + size_t block_size = limit_ <= num_threads ? limit_ : num_threads; + size_t grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const phi::GPUContext& dev_ctx_; + size_t limit_; +}; + +// NOTE: After the pten kernel is migrated, it needs to be deleted. +template <> +struct ForRange { + ForRange(const paddle::platform::CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(limit) {} + + template + inline void operator()(Function func) const { + phi::funcs::ForRange for_range(dev_ctx_, limit_); + for_range(func); + } + + const paddle::platform::CUDADeviceContext& dev_ctx_; + size_t limit_; +}; + +#endif + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/poisson_kernel.cu b/paddle/phi/kernels/gpu/poisson_kernel.cu index ae97f2fca68cb..347f70b166657 100644 --- a/paddle/phi/kernels/gpu/poisson_kernel.cu +++ b/paddle/phi/kernels/gpu/poisson_kernel.cu @@ -19,9 +19,9 @@ limitations under the License. 
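// [Editor's note, not part of this patch] phi::funcs::ForRange defined above is
// the element-wise launch helper used by the kernels below (poisson, abs_grad,
// atan2, complex, digamma, trace_grad): construct it with a device context and
// an element count, then call it with a functor that receives every index in
// [0, limit). An illustrative usage sketch (ScaleFunctor is a made-up example):
//
//   struct ScaleFunctor {
//     float* y;
//     float a;
//     HOSTDEVICE void operator()(size_t i) const { y[i] *= a; }
//   };
//   phi::funcs::ForRange<Context> for_range(dev_ctx, numel);
//   for_range(ScaleFunctor{y_data, 2.0f});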
*/ #include #endif -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/poisson_kernel.h" namespace phi { @@ -65,7 +65,7 @@ void PoissonKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { uint64_t seed = seed_offset.first; uint64_t offset = seed_offset.second; - paddle::platform::ForRange for_range(ctx, size); + phi::funcs::ForRange for_range(ctx, size); PoissonCudaFunctor functor(x_data, out_data, seed, offset); for_range(functor); diff --git a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h index 4b31393a71f36..78c25200bbd28 100644 --- a/paddle/phi/kernels/impl/abs_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/abs_grad_kernel_impl.h @@ -14,10 +14,10 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/abs_grad_kernel.h" #include "paddle/phi/kernels/funcs/complex_functors.h" #include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -53,7 +53,7 @@ void AbsGradKernel(const Context& ctx, ctx.template Alloc(dx, static_cast(numel * sizeof(T))); auto* dx_data = dx->data(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsGradFunctor functor(dout_data, x_data, dx_data, numel); for_range(functor); } @@ -70,7 +70,7 @@ void AbsDoubleGradKernel(const Context& ctx, ctx.template Alloc(ddout, static_cast(numel * sizeof(T))); auto* ddout_data = ddout->data(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::funcs::AbsGradGradFunctor functor( ddx_data, x_data, ddout_data, numel); for_range(functor); diff --git a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h index 5f75a95f4a7b1..d0dd18298518a 100644 --- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -85,7 +85,7 @@ void Atan2GradKernel(const Context& ctx, auto* y_grad_data = ctx.template Alloc(y_grad, size_t(y.numel() * sizeof(T))); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::Atan2GradFunctor functor( x_data, y_data, out_grad_data, x_grad_data, y_grad_data, numel); for_range(functor); diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h index c29449a27e0b5..2cae914e2f615 100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -14,9 +14,9 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { template @@ -80,7 +80,7 @@ void Atan2Kernel(const Context& ctx, auto* out_data = ctx.template Alloc::type>( out, size_t(x.numel() * sizeof(typename Atan2Out::type))); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); phi::Atan2Functor functor(x_data, y_data, out_data, numel); for_range(functor); } diff --git 
a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h index febc464e6a1f5..a10481284b17f 100644 --- a/paddle/phi/kernels/impl/complex_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_grad_kernel_impl.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -28,7 +28,7 @@ void RealGradKernel(const Context& dev_ctx, auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } @@ -42,7 +42,7 @@ void ImagGradKernel(const Context& dev_ctx, auto* dx_data = dev_ctx.template Alloc(dx, static_cast(numel * sizeof(T))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagToComplexFunctor functor(dout_data, dx_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/complex_kernel_impl.h b/paddle/phi/kernels/impl/complex_kernel_impl.h index 2f9b1ad046653..ff5cf86ed2ea2 100644 --- a/paddle/phi/kernels/impl/complex_kernel_impl.h +++ b/paddle/phi/kernels/impl/complex_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once // See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/kernels/funcs/complex_functors.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -28,7 +28,7 @@ void ConjKernel(const Context& dev_ctx, auto* x_data = x.data(); auto* out_data = dev_ctx.template Alloc(out); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ConjFunctor functor(x_data, numel, out_data); for_range(functor); } @@ -42,7 +42,7 @@ void RealKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc>( out, static_cast(numel * sizeof(phi::funcs::Real))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::RealFunctor functor(x_data, out_data, numel); for_range(functor); } @@ -56,7 +56,7 @@ void ImagKernel(const Context& dev_ctx, auto* out_data = dev_ctx.template Alloc>( out, static_cast(numel * sizeof(phi::funcs::Real))); - paddle::platform::ForRange for_range(dev_ctx, numel); + phi::funcs::ForRange for_range(dev_ctx, numel); phi::funcs::ImagFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h index f94fe7168b2a5..74ded1569eb58 100644 --- a/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_grad_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -47,7 +47,7 @@ void DigammaGradKernel(const Context& ctx, auto* x_data = x.data(); auto* dx_data = x_grad->data(); auto numel = out_grad.numel(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); DigammaGradFunctor functor(dout_data, x_data, dx_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/digamma_kernel_impl.h b/paddle/phi/kernels/impl/digamma_kernel_impl.h index 
5a924a322d6e9..8994979e64d70 100644 --- a/paddle/phi/kernels/impl/digamma_kernel_impl.h +++ b/paddle/phi/kernels/impl/digamma_kernel_impl.h @@ -15,8 +15,8 @@ #pragma once #include -#include "paddle/fluid/platform/for_range.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/for_range.h" namespace phi { @@ -41,7 +41,7 @@ void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) { auto* x_data = x.data(); auto* out_data = out->data(); auto numel = x.numel(); - paddle::platform::ForRange for_range(ctx, numel); + phi::funcs::ForRange for_range(ctx, numel); DigammaFunctor functor(x_data, out_data, numel); for_range(functor); } diff --git a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h index 5263f92cb578b..b0878d779462a 100644 --- a/paddle/phi/kernels/impl/trace_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/trace_grad_kernel_impl.h @@ -21,7 +21,7 @@ #include -#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/kernels/funcs/for_range.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace phi { @@ -130,7 +130,7 @@ void TraceGradKernel(const Context& ctx, const auto* input_arr = input_stride.Get(); #endif - paddle::platform::ForRange for_range(ctx, in_grad->numel()); + phi::funcs::ForRange for_range(ctx, in_grad->numel()); TraceGradFunctor functor(out_data, output_arr, input_arr, From ab872efef2ff7fd75ab6dcdbcc6a11d69f23dcfa Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Sat, 26 Feb 2022 09:17:30 +0800 Subject: [PATCH 65/85] fix mkldnn softmax erro (#39951) --- paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc index 450462e7d4bb9..626d3ef40b166 100644 --- a/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/log_softmax_mkldnn_op.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/softmax_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { From a456dda6a3a83b410ef51e38d8b814b747c6af36 Mon Sep 17 00:00:00 2001 From: Weilong Wu Date: Sat, 26 Feb 2022 10:37:12 +0800 Subject: [PATCH 66/85] [Eager Hook] Support GradientHook and ReduceHook, expose related interface to python (#39893) * Support Eager Hook, expose interface to python * Fix CI issue --- .../eager/accumulation/accumulation_node.cc | 6 +- .../eager/accumulation/accumulation_node.h | 5 +- paddle/fluid/eager/api/utils/hook_utils.cc | 51 +++--- paddle/fluid/eager/api/utils/hook_utils.h | 8 +- .../auto_code_generator/eager_generator.cc | 7 +- paddle/fluid/eager/grad_node_info.cc | 25 ++- paddle/fluid/eager/grad_node_info.h | 31 +++- paddle/fluid/eager/hooks.h | 63 +++++++ .../accumulation_node_test.cc | 7 +- .../grad_node_info_test.cc | 21 ++- .../tests/task_tests/fwd_bwd_joint_test.cc | 20 +- .../fluid/eager/tests/task_tests/hook_test.cc | 25 +-- .../task_tests/hook_test_intermidiate.cc | 84 ++++++--- paddle/fluid/pybind/eager_method.cc | 172 ++++++++++++++++++ .../fluid/dygraph/varbase_patch_methods.py | 5 +- .../unittests/test_tensor_register_hook.py | 94 ++++++++-- 16 files changed, 488 insertions(+), 136 deletions(-) create mode 100644 paddle/fluid/eager/hooks.h diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 2e377e43ca3ec..3a2ec403c0a59 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -76,13 +76,13 @@ operator()( } void GradNodeAccumulation::RegisterReduceHook( - const std::function& hook) { - reduce_hooks_.emplace_back(hook); + std::shared_ptr&& hook) { + reduce_hooks_.emplace_back(std::move(hook)); } void GradNodeAccumulation::ApplyReduceHooks() { for (auto& hook : reduce_hooks_) { - hook(); + (*hook)(); } } } // namespace egr diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 787149ab30526..734cabdc3dc91 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -16,6 +16,7 @@ #include "paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" namespace egr { @@ -39,7 +40,7 @@ class GradNodeAccumulation : public GradNodeBase { /** * Register ReduceHook * **/ - void RegisterReduceHook(const std::function& hook); + void RegisterReduceHook(std::shared_ptr&& hook); /** * Apply ReduceHook here @@ -54,7 +55,7 @@ class GradNodeAccumulation : public GradNodeBase { const paddle::experimental::Tensor&)> retain_grad_hook_; - std::vector> reduce_hooks_; + std::vector> reduce_hooks_; }; } // namespace egr diff --git a/paddle/fluid/eager/api/utils/hook_utils.cc b/paddle/fluid/eager/api/utils/hook_utils.cc index 748afe6d1f313..c792771630052 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.cc +++ b/paddle/fluid/eager/api/utils/hook_utils.cc @@ -22,19 +22,19 @@ namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook) { + std::shared_ptr&& hook) { // Find grad_node and out_rank from AutogradMeta std::shared_ptr grad_node = EagerUtils::grad_node(tensor); auto rank_info = EagerUtils::unsafe_autograd_meta(tensor)->OutRankInfo(); - grad_node->RegisterGradientHook(rank_info.first, 
rank_info.second, hook); + return grad_node->RegisterGradientHook(rank_info.first, rank_info.second, + std::move(hook)); } void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook) { + std::shared_ptr&& hook) { if (IsLeafTensor(tensor)) { VLOG(6) << "Register ReduceHook for leaf tensor"; std::shared_ptr grad_node = EagerUtils::grad_node(tensor); @@ -45,7 +45,7 @@ void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, "with type: GradNodeAccumulation")); auto accumulation_grad_node = std::dynamic_pointer_cast(grad_node); - accumulation_grad_node->RegisterReduceHook(hook); + accumulation_grad_node->RegisterReduceHook(std::move(hook)); } else { PADDLE_THROW(paddle::platform::errors::Fatal( "Only can register reduce hook for leaf Tensor.")); @@ -65,28 +65,27 @@ static void RetainGradForRegularNode( meta->WeakGrad(); // Define Hook - std::function - hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { - if (!weak_grad_tensor.expired()) { - auto grad_tensor = weak_grad_tensor.lock(); - if (t.defined()) { - VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); - // Simply Copy impl() to grad_tensor - grad_tensor->set_impl(t.impl()); - return *grad_tensor.get(); - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - } else { - VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; - return paddle::experimental::Tensor(); - } - }; + auto hook = [weak_grad_tensor](const paddle::experimental::Tensor& t) { + if (!weak_grad_tensor.expired()) { + auto grad_tensor = weak_grad_tensor.lock(); + if (t.defined()) { + VLOG(7) << "Set impl for RetainGrad Hook for tensor: " << t.name(); + // Simply Copy impl() to grad_tensor + grad_tensor->set_impl(t.impl()); + return *grad_tensor.get(); + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + } else { + VLOG(7) << "Retain NULL paddle::experimental::Tensor in Grad Hook"; + return paddle::experimental::Tensor(); + } + }; // Append to GradientHooks - RegisterGradientHookForTensor(tensor, hook); + RegisterGradientHookForTensor(tensor, + std::make_shared(hook)); } void RetainGradForTensor(const paddle::experimental::Tensor& tensor) { diff --git a/paddle/fluid/eager/api/utils/hook_utils.h b/paddle/fluid/eager/api/utils/hook_utils.h index 4c4ecc9fb801d..b36ef81125a8c 100644 --- a/paddle/fluid/eager/api/utils/hook_utils.h +++ b/paddle/fluid/eager/api/utils/hook_utils.h @@ -16,17 +16,17 @@ #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { namespace egr_utils_api { -void RegisterGradientHookForTensor( +int64_t RegisterGradientHookForTensor( const paddle::experimental::Tensor& tensor, - std::function& hook); + std::shared_ptr&& hook); void RegisterReduceHookForTensor(const paddle::experimental::Tensor& tensor, - const std::function& hook); + std::shared_ptr&& hook); void RetainGradForTensor(const paddle::experimental::Tensor& tensor); } // namespace egr_utils_api diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index e1f4d6ee9a129..74c5bcdb20984 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -2040,12 +2040,13 @@ static 
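// [Editor's note, not part of this patch] With the new hook classes, C++ hook
// registration reduces to wrapping a callable, e.g. (illustrative sketch based
// on the interfaces shown in hook_utils.h above; reduce hooks are only allowed
// on leaf tensors):
//
//   auto grad_hook = [](const paddle::experimental::Tensor& grad) { return grad; };
//   int64_t id = egr::egr_utils_api::RegisterGradientHookForTensor(
//       tensor, std::make_shared<egr::CppTensorHook>(grad_hook));
//   egr::egr_utils_api::RegisterReduceHookForTensor(
//       tensor, std::make_shared<egr::CppTensorVoidHook>([]() { /* void hook */ }));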
std::string GenerateGradNodeCCContents( const char* BWD_RETURN_TEMPLATE = " std::vector> hooked_grads = " - "egr::GradNodeBase::ApplyGradientHooks(grads);\n" + "GradNode%s::ApplyGradientHooks(grads);\n" " std::vector> outputs(%d);\n" " %s\n" " return outputs;\n"; - generated_grad_function_body = paddle::string::Sprintf( - BWD_RETURN_TEMPLATE, in_vars.size(), generated_grad_function_body); + generated_grad_function_body = + paddle::string::Sprintf(BWD_RETURN_TEMPLATE, fwd_op_type, in_vars.size(), + generated_grad_function_body); // [Generation] Get Full Grad Function const char* GRAD_FUNCTION_TEMPLATE = diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 27c376b4c80c6..35416281f1888 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -210,22 +210,22 @@ const std::vector>& GradNodeBase::GetEdges() const { return adj_edges_; } -void GradNodeBase::RegisterGradientHook( - size_t slot_id, size_t rank, - const std::function& hook) { - gradient_hooks_.emplace_back(std::make_tuple(slot_id, rank, hook)); +int64_t GradNodeBase::RegisterGradientHook( + size_t slot_id, size_t rank, std::shared_ptr&& hook) { + gradient_hooks_.emplace(next_hook_id_, + std::make_tuple(slot_id, rank, std::move(hook))); + return next_hook_id_++; } std::vector> GradNodeBase::ApplyGradientHooks( const std::vector>& tensors) { std::vector> outs(tensors.size()); - for (auto& tuple : gradient_hooks_) { - size_t slot_id = std::get<0>(tuple); - size_t rank = std::get<1>(tuple); - std::function& hook = std::get<2>(tuple); + for (auto& hook_pair : gradient_hooks_) { + size_t slot_id = std::get<0>(hook_pair.second); + size_t rank = std::get<1>(hook_pair.second); + + auto hook = std::get<2>(hook_pair.second); PADDLE_ENFORCE(slot_id < tensors.size(), paddle::platform::errors::Fatal( @@ -242,12 +242,11 @@ GradNodeBase::ApplyGradientHooks( slot_out.resize(tensors[slot_id].size()); paddle::experimental::Tensor& out = slot_out[rank]; if (!out.defined() || !out.initialized()) { - VLOG(8) << "Run Hook for tensor: " << tensors[slot_id][rank].name(); - out = hook(tensors[slot_id][rank]); + out = (*hook)(tensors[slot_id][rank]); } else { // If more than one hook is registered, the input to the next hook func // should be the output of the previous hook - out = hook(out); + out = (*hook)(out); } } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index f699f9ab28e2d..eeac1cca4acf3 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -15,6 +15,7 @@ #pragma once #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/all.h" namespace egr { @@ -135,14 +136,24 @@ class GradNodeBase { /** * Register GradientHook * **/ - void RegisterGradientHook(size_t slot_id, size_t rank, - const std::function& hook); + int64_t RegisterGradientHook(size_t slot_id, size_t rank, + std::shared_ptr&& hook); + + /** + * Remove GradientHook + * **/ + bool RemoveGradientHook(const int64_t& hook_id) { + auto remove_cnt = gradient_hooks_.erase(hook_id); + if (remove_cnt == 0) { + return false; + } + return true; + } /** * Apply GradientHook * **/ - inline bool GradientHooksRegistered() { return gradient_hooks_.size() != 0; } + inline bool GradientHooksRegistered() { return !gradient_hooks_.empty(); } std::vector> ApplyGradientHooks( const std::vector>& tensors); @@ -166,12 +177,14 @@ class GradNodeBase { // Gradient Hooks // Customer may register a list of 
hooks which will be called in order during // backward - // Each entry consists one pair of - std::vector>> + // Each entry consists one pair of + // >> + std::map>> gradient_hooks_; + + int64_t next_hook_id_{0}; }; class Edge { diff --git a/paddle/fluid/eager/hooks.h b/paddle/fluid/eager/hooks.h new file mode 100644 index 0000000000000..097150cf5ed59 --- /dev/null +++ b/paddle/fluid/eager/hooks.h @@ -0,0 +1,63 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/phi/api/include/tensor.h" +namespace egr { + +class TensorHook { + public: + virtual ~TensorHook() = default; + virtual paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) = 0; +}; + +class TensorVoidHook { + public: + virtual ~TensorVoidHook() = default; + virtual void operator()() = 0; +}; + +class CppTensorHook : public TensorHook { + public: + explicit CppTensorHook(std::function&& fn) + : fn_(std::move(fn)) {} + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + return fn_(var); + } + + private: + std::function + fn_; +}; + +class CppTensorVoidHook : public TensorVoidHook { + public: + explicit CppTensorVoidHook(std::function&& fn) : fn_(std::move(fn)) {} + + void operator()() override { return fn_(); } + + private: + std::function fn_; +}; +} // namespace egr diff --git a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc index 880bd26841027..28682ab0fe094 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/accumulation_node_test.cc @@ -23,6 +23,7 @@ #include "paddle/fluid/eager/grad_tensor_holder.h" #include "paddle/fluid/eager/utils.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/kernel_registry.h" @@ -116,7 +117,8 @@ TEST(AccumulationNode, Tensor) { VLOG(6) << "Running Reduce Hook"; }; - node->RegisterReduceHook(reduce_hook_1); + node->RegisterReduceHook( + std::make_shared(reduce_hook_1)); // operator() paddle::experimental::Tensor _ret = node->operator()({{et0}})[0][0]; @@ -141,7 +143,8 @@ TEST(AccumulationNode, Tensor) { ret_et0_ptr[0] = 100.0; // set to 100.0 VLOG(6) << "Running Reduce Hook"; }; - node->RegisterReduceHook(reduce_hook_2); + node->RegisterReduceHook( + std::make_shared(reduce_hook_2)); node->ApplyReduceHooks(); // Check ApplyReduceHooks result diff --git a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc index aee6ee7488671..e3db309c4016a 100644 --- a/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc +++ b/paddle/fluid/eager/tests/data_structure_tests/grad_node_info_test.cc @@ -17,6 +17,7 @@ #include 
"paddle/fluid/eager/autograd_meta.h" #include "paddle/fluid/eager/eager_tensor.h" #include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/data_structure_tests/grad_node_test.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -32,7 +33,7 @@ TEST(GradNodeInfo, GradSlotMeta) { CHECK_EQ(grad_slot.Size(), 2); } -TEST(GradNodeInfo, GradNodeBase) { +void TestGradNodeBase(bool is_remove_gradient_hook) { VLOG(6) << "Construct Grad Node"; auto grad_test_node0 = std::make_shared( /* val */ 5.0, /* in_num */ 2, /* out_num */ 2); @@ -112,13 +113,25 @@ TEST(GradNodeInfo, GradNodeBase) { VLOG(6) << "Running Gradient Hook"; return res; }; - grad_test_node0->RegisterGradientHook(0, 0, gradient_hook); - // 5 + 6 + int64_t hook_id = grad_test_node0->RegisterGradientHook( + 0, 0, std::make_shared(gradient_hook)); + + if (is_remove_gradient_hook) { + // Remove GradientHook + grad_test_node0->RemoveGradientHook(hook_id); + } + + // Check results auto grad_hook_res = grad_test_node0->ApplyGradientHooks(grads); CHECK_EQ( std::dynamic_pointer_cast(grad_hook_res[0][0].impl()) ->data()[0], - 11.0); + is_remove_gradient_hook ? 5.0 : 11.0); +} + +TEST(GradNodeInfo, GradNodeBase) { + TestGradNodeBase(true); + TestGradNodeBase(false); } TEST(GradNodeInfo, Edge) { diff --git a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc index 752fd7812847c..5a7bafb2fe370 100644 --- a/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc +++ b/paddle/fluid/eager/tests/task_tests/fwd_bwd_joint_test.cc @@ -27,6 +27,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -221,10 +222,6 @@ TEST(FwdBwdJoint, GradientHook) { phi::DataLayout::NCHW, 5.0 /*value*/, true /*is_leaf*/); egr_utils_api::RetainGradForTensor(tensor); - std::function - hook = &hook_function; - // 3. Run Forward // Run Forward Node 0 float scale0 = 2.0; @@ -232,24 +229,27 @@ TEST(FwdBwdJoint, GradientHook) { paddle::experimental::Tensor out0 = egr::scale(tensor, scale0, bias0, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out0); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out0, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out0); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out0, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 1 float scale1 = 5.0; float bias1 = 10.0; paddle::experimental::Tensor out1 = egr::scale( out0, scale1, bias1, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out1); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out1, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out1); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out1, std::make_shared(hook_function)); // hook: +5 // Run Forward Node 2 float scale2 = 10.0; float bias2 = 20.0; paddle::experimental::Tensor out2 = egr::scale( out0, scale2, bias2, true /*bias_after_scale*/, true /*trace_backward*/); - egr_utils_api::RetainGradForTensor(out2); // hook: +5 - egr_utils_api::RegisterGradientHookForTensor(out2, hook); // hook: +5 + egr_utils_api::RetainGradForTensor(out2); // hook: +5 + egr_utils_api::RegisterGradientHookForTensor( + out2, std::make_shared(hook_function)); // hook: +5 // 4. 
Run Backward std::vector outs = {out1, out2}; diff --git a/paddle/fluid/eager/tests/task_tests/hook_test.cc b/paddle/fluid/eager/tests/task_tests/hook_test.cc index fbc71168fe416..9cda961741f55 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test.cc @@ -28,6 +28,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/tests/test_utils.h" namespace egr { @@ -83,9 +84,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -96,7 +94,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); egr_utils_api::RetainGradForTensor( target_tensor); // result: 1.0 + 3.0 = 4.0 egr_utils_api::RetainGradForTensor( @@ -107,9 +106,6 @@ TEST(RetainGrad, HookBeforeRetainGrad) { paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); @@ -126,7 +122,8 @@ TEST(RetainGrad, HookBeforeRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); egr_utils_api::RetainGradForTensor( leaf_tensor); // result: 4.0*5.0 + 3.0 = 23.0 } @@ -161,9 +158,6 @@ TEST(RetainGrad, HookAfterRetainGrad) { // Apply RetainGrad { // ScaleNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto_grad_meta->SetGradNode( @@ -175,16 +169,14 @@ TEST(RetainGrad, HookAfterRetainGrad) { auto_grad_meta)); egr_utils_api::RetainGradForTensor(target_tensor); // result: 1.0 - egr_utils_api::RegisterGradientHookForTensor(target_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + target_tensor, std::make_shared(hook_function)); } // Retain Grad for leaf tensor1 paddle::experimental::Tensor leaf_tensor = paddle::experimental::Tensor(); { // AccumulationNode Hook: +3 - std::function - hook = &hook_function; auto auto_grad_meta = std::make_shared(); auto acc_node_ptr = @@ -199,7 +191,8 @@ TEST(RetainGrad, HookAfterRetainGrad) { std::dynamic_pointer_cast( auto_grad_meta)); - egr_utils_api::RegisterGradientHookForTensor(leaf_tensor, hook); + egr_utils_api::RegisterGradientHookForTensor( + leaf_tensor, std::make_shared(hook_function)); } RunBackward(target_tensors, {}); diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc index dbcfe704dbe1c..15b2a62dca751 100644 --- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc +++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc @@ -24,6 +24,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/phi/core/kernel_registry.h" namespace egr { @@ -54,7 +55,7 @@ paddle::experimental::Tensor hook_function( return ret; } -TEST(Hook_intermidiate, Sigmoid) { +void test_sigmoid(bool is_remove_gradient_hook) { // Prepare Device Contexts 
VLOG(6) << "Init Env"; eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -67,11 +68,6 @@ TEST(Hook_intermidiate, Sigmoid) { ddim, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 0.0, true); - VLOG(6) << "Make Hook function"; - std::function - hook = &hook_function; - VLOG(6) << "Make ReduceHook function"; auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(tensor.impl()) @@ -85,10 +81,12 @@ TEST(Hook_intermidiate, Sigmoid) { egr_utils_api::RetainGradForTensor(tensor); VLOG(6) << "Register GradientHook for Tensor"; - egr_utils_api::RegisterGradientHookForTensor(tensor, hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + tensor, std::make_shared(hook_function)); VLOG(6) << "Register ReduceHook for Tensor"; - egr_utils_api::RegisterReduceHookForTensor(tensor, reduce_hook); + egr_utils_api::RegisterReduceHookForTensor( + tensor, std::make_shared(reduce_hook)); VLOG(6) << "Runing Forward"; auto output_tensor = sigmoid_dygraph_function(tensor, {}); @@ -98,11 +96,17 @@ TEST(Hook_intermidiate, Sigmoid) { std::vector target_tensors = {output_tensor}; + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(tensor); + grad_node_tmp->RemoveGradientHook(hook_id); + } + VLOG(6) << "Runing Backward"; RunBackward(target_tensors, {}); VLOG(6) << "Finish Backward"; - eager_test::CompareGradTensorWithValue(tensor, 0.25 + 3); + eager_test::CompareGradTensorWithValue( + tensor, is_remove_gradient_hook ? 0.25 : 0.25 + 3.0); VLOG(6) << "Checking ReduceHook results"; for (int i = 0; i < tensor.numel(); i++) { @@ -113,7 +117,7 @@ TEST(Hook_intermidiate, Sigmoid) { VLOG(6) << "After Tests"; } -TEST(Hook_intermidiate, ElementwiseAdd) { +void test_elementwiseAdd(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -132,11 +136,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - - auto reduce_hook = [&](void) -> void { + auto reduce_hook = [&]() -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); for (int i = 0; i < Y.numel(); i++) { @@ -145,18 +145,26 @@ TEST(Hook_intermidiate, ElementwiseAdd) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = elementwise_add_dygraph_function(X, Y, {}); eager_test::CompareTensorWithValue(output_tensor, 5); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 1.0); - eager_test::CompareGradTensorWithValue(Y, 4.0); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 
1.0 : 1.0 + 3.0); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -166,7 +174,7 @@ TEST(Hook_intermidiate, ElementwiseAdd) { } } -TEST(Hook_intermidiate, Matmul_v2) { +void test_matmul(bool is_remove_gradient_hook) { // Prepare Device Contexts eager_test::InitEnv(paddle::platform::CPUPlace()); @@ -185,10 +193,6 @@ TEST(Hook_intermidiate, Matmul_v2) { ddimY, paddle::platform::CPUPlace(), phi::DataType::FLOAT32, phi::DataLayout::NCHW, 2.0, true); - std::function - hook = &hook_function; - auto reduce_hook = [&](void) -> void { auto* t_ptr = std::dynamic_pointer_cast(Y.impl())->data(); @@ -198,19 +202,27 @@ TEST(Hook_intermidiate, Matmul_v2) { }; egr_utils_api::RetainGradForTensor(Y); - egr_utils_api::RegisterGradientHookForTensor(Y, hook); - egr_utils_api::RegisterReduceHookForTensor(Y, reduce_hook); + int64_t hook_id = egr_utils_api::RegisterGradientHookForTensor( + Y, std::make_shared(hook_function)); + egr_utils_api::RegisterReduceHookForTensor( + Y, std::make_shared(reduce_hook)); auto output_tensor = matmul_v2_dygraph_function( X, Y, {{"trans_x", false}, {"trans_y", false}}); eager_test::CompareTensorWithValue(output_tensor, 96); - std::vector target_tensors = {output_tensor}; + + if (is_remove_gradient_hook) { + std::shared_ptr grad_node_tmp = EagerUtils::grad_node(Y); + grad_node_tmp->RemoveGradientHook(hook_id); + } + RunBackward(target_tensors, {}); eager_test::CompareGradTensorWithValue(X, 2.0 * 20); - eager_test::CompareGradTensorWithValue(Y, 3.0 * 4 + 3); + eager_test::CompareGradTensorWithValue( + Y, is_remove_gradient_hook ? 3.0 * 4 : 3.0 * 4 + 3); // Checking ReduceHook results for (int i = 0; i < Y.numel(); i++) { @@ -219,6 +231,22 @@ TEST(Hook_intermidiate, Matmul_v2) { static_cast(100.0f)); } } + +TEST(Hook_intermidiate, Sigmoid) { + // True or false represents whether to call RemoveGradientHook + test_sigmoid(true); + test_sigmoid(false); +} + +TEST(Hook_intermidiate, ElementwiseAdd) { + test_elementwiseAdd(true); + test_elementwiseAdd(false); +} + +TEST(Hook_intermidiate, Matmul_v2) { + test_matmul(true); + test_matmul(false); +} } // namespace egr USE_OP(sigmoid); diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 4e900ae2ffbc1..221d4d53d0663 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -20,6 +20,8 @@ limitations under the License. */ #include "paddle/fluid/eager/accumulation/accumulation_node.h" #include "paddle/fluid/eager/api/all.h" #include "paddle/fluid/eager/autograd_meta.h" +#include "paddle/fluid/eager/grad_node_info.h" +#include "paddle/fluid/eager/hooks.h" #include "paddle/fluid/eager/utils.h" #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/memory/allocation/allocator.h" @@ -35,6 +37,82 @@ limitations under the License. 
*/ namespace paddle { namespace pybind { +namespace py = ::pybind11; + +class PyTensorHook : public egr::TensorHook { + public: + explicit PyTensorHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + paddle::experimental::Tensor operator()( + const paddle::experimental::Tensor& var) override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorHook for var " << var.name(); + + PyObject* res = nullptr; + try { + res = PyObject_CallFunctionObjArgs(py_func_, ToPyObject(var), nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) { + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + + PADDLE_ENFORCE_NOT_NULL(res, + platform::errors::Unavailable( + "Hook function of Tensor return a nullptr.")); + if (res == Py_None) { + return var; + } + return reinterpret_cast(res)->tensor; + } + + private: + PyObject* py_func_; +}; + +class PyTensorVoidHook : public egr::TensorVoidHook { + public: + explicit PyTensorVoidHook(PyObject* func) : py_func_(func) { + Py_INCREF(py_func_); + } + + ~PyTensorVoidHook() { + py::gil_scoped_acquire gil; + Py_DECREF(py_func_); + } + + void operator()() override { + py::gil_scoped_acquire gil; + VLOG(3) << "Call PyTensorVoidHook"; + + try { + PyObject_CallFunctionObjArgs(py_func_, nullptr); + } catch (platform::EnforceNotMet& e) { + throw std::move(e); + } catch (std::exception& e) { + PADDLE_THROW(platform::errors::Unavailable( + "Hook function of Tensor raises an exception: %s.", e.what())); + } catch (...) 
{ + PADDLE_THROW(platform::errors::Fatal( + "Hook function of Tensor raises an unknown exception.")); + } + } + + private: + PyObject* py_func_; +}; + extern void InitTensorWithNumpyValue(TensorObject* self, const pybind11::object& array, bool zero_copy); @@ -403,6 +481,92 @@ static PyObject* tensor_method_set_value(TensorObject* self, PyObject* args, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_register_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + int64_t hook_id; + if (egr::egr_utils_api::IsLeafTensor(self->tensor)) { + VLOG(6) << "Register hook for leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + hook_id = accumulation_grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + + } else { + VLOG(6) << "Register hook for non leaf tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + auto rank_info = + egr::EagerUtils::unsafe_autograd_meta(self->tensor)->OutRankInfo(); + + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + hook_id = grad_node->RegisterGradientHook( + rank_info.first, rank_info.second, + std::make_shared(hook_func)); + } + return ToPyObject(hook_id); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_remove_grad_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(6) << "Remove the registered hook for tensor: " << self->tensor.name(); + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + + int64_t hook_id = pybind::CastPyArg2AttrLong(PyTuple_GET_ITEM(args, 0), 0); + + return ToPyObject(grad_node->RemoveGradientHook(hook_id)); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + +static PyObject* tensor_register_reduce_hook(TensorObject* self, PyObject* args, + PyObject* kwargs) { + EAGER_TRY + VLOG(4) << "Register reduce hook for tensor: " << self->tensor.name(); + + std::shared_ptr grad_node = + egr::EagerUtils::grad_node(self->tensor); + PADDLE_ENFORCE_EQ(egr::egr_utils_api::IsLeafTensor(self->tensor), true, + platform::errors::InvalidArgument( + "Only can register backward hook for leaf Tensor.")); + PADDLE_ENFORCE_EQ( + !egr::EagerUtils::unsafe_autograd_meta(self->tensor)->StopGradient(), + true, platform::errors::InvalidArgument( + "Cannot register backward hook on a Tensor that stop " + "gradient.")); + PADDLE_ENFORCE( + grad_node.get() != nullptr, + paddle::platform::errors::Fatal("Detected NULL grad_node," + "Leaf tensor should have had grad_node " + "with type: GradNodeAccumulation.")); + PyObject* hook_func = PyTuple_GET_ITEM(args, 0); + + auto accumulation_grad_node = + std::dynamic_pointer_cast(grad_node); + accumulation_grad_node->RegisterReduceHook( + std::make_shared(hook_func)); + + Py_INCREF(Py_None); + return Py_None; + EAGER_CATCH_AND_THROW_RETURN_NULL +} + PyMethodDef variable_methods[] = { {"numpy", (PyCFunction)(void (*)(void))tensor_method_numpy, METH_VARARGS | METH_KEYWORDS, NULL}, @@ -440,6 +604,14 @@ PyMethodDef variable_methods[] = { METH_VARARGS | METH_KEYWORDS, NULL}, 
{"_set_value", (PyCFunction)(void (*)(void))tensor_method_set_value, METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_grad_hook", + (PyCFunction)(void (*)(void))tensor_register_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_remove_grad_hook", (PyCFunction)(void (*)(void))tensor_remove_grad_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, + {"_register_backward_hook", + (PyCFunction)(void (*)(void))tensor_register_reduce_hook, + METH_VARARGS | METH_KEYWORDS, NULL}, {NULL, NULL, 0, NULL}}; } // namespace pybind diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index c4ea751ed92f8..65bfba3f6c32e 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -33,10 +33,11 @@ class TensorHookRemoveHelper(object): """ A helper class that for removing Tensor gradient's hook. + NOTE(wuweilong):the operation weakref.ref(tensor) will cause some unexpected errors in eager mode. """ def __init__(self, tensor, hook_id): - self._tensor_ref = weakref.ref(tensor) + self._tensor = tensor if core._in_eager_mode() else weakref.ref(tensor) self._hook_id = hook_id def remove(self): @@ -46,7 +47,7 @@ def remove(self): Returns: bool: Return True if removed successfully """ - tensor = self._tensor_ref() + tensor = self._tensor if core._in_eager_mode() else self._tensor() if tensor is not None: res = tensor._remove_grad_hook(self._hook_id) if res is True: diff --git a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py index 52256766fed75..3238876b89414 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_register_hook.py @@ -19,6 +19,7 @@ import paddle import paddle.nn as nn +from paddle.fluid.framework import _test_eager_guard, _in_eager_mode class SimpleNet(nn.Layer): @@ -64,7 +65,7 @@ def setUp(self): if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - def test_hook_for_interior_var(self): + def func_hook_for_interior_var(self): def run_double_hook_for_interior_var(double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -154,7 +155,12 @@ def print_hook(grad): # register hook and removed run_print_hook_for_interior_var(print_hook, removed=True) - def test_hook_for_leaf_var(self): + def test_hook_for_interior_var(self): + with _test_eager_guard(): + self.func_hook_for_interior_var() + self.func_hook_for_interior_var() + + def func_hook_for_leaf_var(self): def run_double_hook_for_leaf_var(double_hook, removed=False): for device in self.devices: paddle.set_device(device) @@ -193,7 +199,12 @@ def run_double_hook_for_leaf_var(double_hook, removed=False): # register hook and removed run_double_hook_for_leaf_var(lambda grad: grad * 2, removed=True) - def test_hook_for_accumulated_grad_interior_var(self): + def test_hook_for_leaf_var(self): + with _test_eager_guard(): + self.func_hook_for_leaf_var() + self.func_hook_for_leaf_var() + + def func_hook_for_accumulated_grad_interior_var(self): def run_double_hook_for_accumulated_grad_interior_var(double_hook, removed=False): for device in self.devices: @@ -248,7 +259,12 @@ def run_double_hook_for_accumulated_grad_interior_var(double_hook, run_double_hook_for_accumulated_grad_interior_var( lambda grad: grad * 2, removed=True) - def test_hook_for_accumulated_grad_leaf_var(self): + def test_hook_for_accumulated_grad_interior_var(self): + 
with _test_eager_guard(): + self.func_hook_for_accumulated_grad_interior_var() + self.func_hook_for_accumulated_grad_interior_var() + + def func_hook_for_accumulated_grad_leaf_var(self): def run_double_hook_for_accumulated_grad_leaf_var(double_hook, removed=False): for device in self.devices: @@ -289,7 +305,12 @@ def run_double_hook_for_accumulated_grad_leaf_var(double_hook, run_double_hook_for_accumulated_grad_leaf_var( lambda grad: grad * 2, removed=True) - def test_hook_in_model(self): + def test_hook_for_accumulated_grad_leaf_var(self): + with _test_eager_guard(): + self.func_hook_for_accumulated_grad_leaf_var() + self.func_hook_for_accumulated_grad_leaf_var() + + def func_hook_in_model(self): def run_double_hook_in_model(data, label, hook=None, @@ -336,7 +357,12 @@ def run_double_hook_in_model(data, self.assertTrue(np.array_equal(linear1_w_grad, linear1_w_grad_rm)) self.assertTrue(np.array_equal(linear1_b_grad, linear1_b_grad_rm)) - def test_multiple_hooks_for_interior_var(self): + def test_func_hook_in_model(self): + with _test_eager_guard(): + self.func_hook_in_model() + self.func_hook_in_model() + + def func_multiple_hooks_for_interior_var(self): def run_multiple_hooks_for_interior_var(device, hooks, remove1=False, @@ -414,6 +440,12 @@ def double_hook(grad): self.assertTrue(np.array_equal(x_grad, z)) self.assertTrue(np.array_equal(y_grad, z)) + def test_multiple_hooks_for_interior_var(self): + with _test_eager_guard(): + self.func_multiple_hooks_for_interior_var() + self.func_multiple_hooks_for_interior_var() + + # TODO(wuweilong): enable this case when DoubleGrad in eager mode is ready def test_hook_in_double_grad(self): def double_print_hook(grad): grad = grad * 2 @@ -446,7 +478,7 @@ def double_print_hook(grad): z.backward() self.assertTrue(np.array_equal(x.grad.numpy(), np.array([8.]))) - def test_remove_one_hook_multiple_times(self): + def func_remove_one_hook_multiple_times(self): for device in self.devices: paddle.set_device(device) @@ -457,7 +489,12 @@ def test_remove_one_hook_multiple_times(self): self.assertTrue(h.remove()) self.assertFalse(h.remove()) - def test_register_hook_for_stop_gradient_var(self): + def test_remove_one_hook_multiple_times(self): + with _test_eager_guard(): + self.func_remove_one_hook_multiple_times() + self.func_remove_one_hook_multiple_times() + + def func_register_hook_for_stop_gradient_var(self): for device in self.devices: paddle.set_device(device) @@ -466,6 +503,11 @@ def test_register_hook_for_stop_gradient_var(self): with self.assertRaises(RuntimeError): x.register_hook(lambda grad: grad * 2) + def test_register_hook_for_stop_gradient_var(self): + with _test_eager_guard(): + self.func_register_hook_for_stop_gradient_var() + self.func_register_hook_for_stop_gradient_var() + def test_register_hook_in_static_mode(self): paddle.enable_static() @@ -482,7 +524,7 @@ def test_register_hook_in_static_mode(self): paddle.disable_static() - def test_register_hook_in_dy2static_mode(self): + def func_register_hook_in_dy2static_mode(self): net = SimpleNetForStatic(self.in_size, self.out_size) jit_net = paddle.jit.to_static( net, input_spec=[paddle.static.InputSpec([None, self.in_size])]) @@ -491,8 +533,17 @@ def test_register_hook_in_dy2static_mode(self): size=[self.batch_size, self.in_size]).astype('float32') data_t = paddle.to_tensor(data) - with self.assertRaises(AssertionError): - out = jit_net(data_t) + if _in_eager_mode(): + with self.assertRaises(TypeError): + out = jit_net(data_t) + else: + with self.assertRaises(AssertionError): + out = 
jit_net(data_t) + + def test_register_hook_in_dy2static_mode(self): + with _test_eager_guard(): + self.func_register_hook_in_dy2static_mode() + self.func_register_hook_in_dy2static_mode() HOOK_INIT_VALUE = 10 @@ -512,7 +563,7 @@ def setUp(self): if paddle.is_compiled_with_cuda(): self.devices.append("gpu") - def test_register_backward_hook(self): + def func_register_backward_hook(self): global HOOK_INIT_VALUE global HOOK_IS_CALLED for device in self.devices: @@ -529,20 +580,35 @@ def test_register_backward_hook(self): HOOK_INIT_VALUE = 10 HOOK_IS_CALLED = False - def test_register_backward_hook_for_interior_var(self): + def test_register_backward_hook(self): + with _test_eager_guard(): + self.func_register_backward_hook() + self.func_register_backward_hook() + + def func_register_backward_hook_for_interior_var(self): x = paddle.to_tensor(5., stop_gradient=False) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): y._register_backward_hook(global_void_hook) - def test_register_backward_hook_for_var_without_gradient(self): + def test_register_backward_hook_for_interior_var(self): + with _test_eager_guard(): + self.func_register_backward_hook_for_interior_var() + self.func_register_backward_hook_for_interior_var() + + def func_register_backward_hook_for_var_without_gradient(self): x = paddle.to_tensor(5.) y = paddle.pow(x, 4.0) with self.assertRaises(ValueError): x._register_backward_hook(global_void_hook) + def test_register_backward_hook_for_var_without_gradient(self): + with _test_eager_guard(): + self.func_register_backward_hook_for_var_without_gradient() + self.func_register_backward_hook_for_var_without_gradient() + if __name__ == '__main__': unittest.main() From de8f27485ac54cbfe1bb3bd59d2cc68c2e20c335 Mon Sep 17 00:00:00 2001 From: From00 Date: Sat, 26 Feb 2022 11:02:47 +0800 Subject: [PATCH 67/85] Move BilinearTensorProduct OP to phi (#39903) * Move BilinearTensorProduct OP to phi * Set dtype for Infermeta --- .../operators/bilinear_tensor_product_op.cc | 154 ++------------- .../operators/bilinear_tensor_product_op.cu | 29 --- .../operators/bilinear_tensor_product_op.h | 181 ------------------ paddle/phi/infermeta/backward.cc | 48 +++++ paddle/phi/infermeta/backward.h | 9 + paddle/phi/infermeta/multiary.cc | 66 +++++++ paddle/phi/infermeta/multiary.h | 7 + .../bilinear_tensor_product_grad_kernel.h | 32 ++++ .../kernels/bilinear_tensor_product_kernel.h | 30 +++ .../bilinear_tensor_product_grad_kernel.cc | 25 +++ .../cpu/bilinear_tensor_product_kernel.cc | 25 +++ .../bilinear_tensor_product_grad_kernel.cu | 25 +++ .../gpu/bilinear_tensor_product_kernel.cu | 25 +++ ...bilinear_tensor_product_grad_kernel_impl.h | 144 ++++++++++++++ .../bilinear_tensor_product_kernel_impl.h | 75 ++++++++ .../ops/compat/bilinear_tensor_product_sig.cc | 41 ++++ 16 files changed, 569 insertions(+), 347 deletions(-) delete mode 100644 paddle/fluid/operators/bilinear_tensor_product_op.cu delete mode 100644 paddle/fluid/operators/bilinear_tensor_product_op.h create mode 100644 paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h create mode 100644 paddle/phi/kernels/bilinear_tensor_product_kernel.h create mode 100644 paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc create mode 100644 paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu create mode 100644 paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h create 
mode 100644 paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h create mode 100644 paddle/phi/ops/compat/bilinear_tensor_product_sig.cc diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cc b/paddle/fluid/operators/bilinear_tensor_product_op.cc index 253a96004bd30..4774c0a1dbc3b 100644 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cc +++ b/paddle/fluid/operators/bilinear_tensor_product_op.cc @@ -12,84 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/bilinear_tensor_product_op.h" -#include -#include -#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { -using framework::Tensor; - class BilinearTensorProductOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::InvalidArgument("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Weight"), true, - platform::errors::InvalidArgument("Input(Weight) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasOutput("Out"), true, - platform::errors::InvalidArgument("Output(Out) should not be null.")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto weight_dims = ctx->GetInputDim("Weight"); - - PADDLE_ENFORCE_EQ( - x_dims.size(), 2UL, - platform::errors::InvalidArgument("The input(X) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - y_dims.size(), 2UL, - platform::errors::InvalidArgument("The input(Y) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - weight_dims.size(), 3UL, - platform::errors::InvalidArgument("Expected the input(Weight) is a 3D " - "tensor. 
But received %dD tensor.", - weight_dims.size())); - if (ctx->IsRuntime() || (x_dims[0] > 0 && y_dims[0] > 0)) { - PADDLE_ENFORCE_EQ( - x_dims[0], y_dims[0], - platform::errors::InvalidArgument( - "The first dimension(batch_size) of input(X) must be " - "equal to the first dimension of the input(Y).")); - } - PADDLE_ENFORCE_EQ(x_dims[1], weight_dims[1], - platform::errors::InvalidArgument( - "The second dimension of input(X) must be equal to " - "the second dimension of the input(Weight).")); - PADDLE_ENFORCE_EQ(y_dims[1], weight_dims[2], - platform::errors::InvalidArgument( - "The second dimension of input(Y) must be equal to " - "the third dimension of the input(Weight).")); - - if (ctx->HasInput("Bias")) { - auto bias_dims = ctx->GetInputDim("Bias"); - PADDLE_ENFORCE_EQ(bias_dims.size(), 2UL, - platform::errors::InvalidArgument( - "The Input(Bias) must be a 2-D tensor with " - "the 2nd dimension fixed to 1 (a row vector).")); - PADDLE_ENFORCE_EQ(bias_dims[0], 1UL, - platform::errors::InvalidArgument( - "The Input(Bias) must be a 2-D tensor with " - "the 2nd dimension fixed to 1 (a row vector).")); - PADDLE_ENFORCE_EQ(bias_dims[1], weight_dims[0], - platform::errors::InvalidArgument( - "The second dimension of input(Bias) must be equal " - "to the first dimension of the input(Weight).")); - } - - ctx->SetOutputDim("Out", {x_dims[0], weight_dims[0]}); - ctx->ShareLoD("X", /*->*/ "Out"); - } }; class BilinearTensorProductOpMaker : public framework::OpProtoAndCheckerMaker { @@ -125,59 +59,6 @@ Where $W_i$ is the $i$-th slice of Input(Weight); class BilinearTensorProductOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ( - ctx->HasInput("X"), true, - platform::errors::InvalidArgument("Input(X) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Y"), true, - platform::errors::InvalidArgument("Input(Y) should not be null.")); - PADDLE_ENFORCE_EQ( - ctx->HasInput("Weight"), true, - platform::errors::InvalidArgument("Input(Weight) should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, - platform::errors::InvalidArgument( - "Input(Out@GRAD) should not be null.")); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - auto weight_dims = ctx->GetInputDim("Weight"); - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - - PADDLE_ENFORCE_EQ(out_dims.size(), 2UL, - platform::errors::InvalidArgument( - "The input(Out@GRAD) must be a 2D Tensor.")); - PADDLE_ENFORCE_EQ( - x_dims[0], out_dims[0], - platform::errors::InvalidArgument( - "The first dimension(batch_size) of input(Out@GRAD) must be " - "equal to the first dimension of the Input(X).")); - PADDLE_ENFORCE_EQ( - weight_dims[0], out_dims[1], - platform::errors::InvalidArgument( - "The second dimension of input(Out@GRAD) must be equal to " - "the third dimension of the Input(Weight).")); - - auto bias_grad_name = framework::GradVarName("Bias"); - if (ctx->HasOutput(bias_grad_name)) { - ctx->SetOutputDim(bias_grad_name, {1, out_dims[1]}); - } - - auto x_grad_name = framework::GradVarName("X"); - auto y_grad_name = framework::GradVarName("Y"); - auto weight_grad_name = framework::GradVarName("Weight"); - - if (ctx->HasOutput(x_grad_name)) { - ctx->SetOutputDim(x_grad_name, x_dims); - } - if (ctx->HasOutput(y_grad_name)) { - ctx->SetOutputDim(y_grad_name, y_dims); - } - if 
(ctx->HasOutput(weight_grad_name)) { - ctx->SetOutputDim(weight_grad_name, weight_dims); - } - } }; template @@ -208,21 +89,20 @@ class BilinearTensorProductGradOpMaker } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(bilinear_tensor_product, + BilinearTensorProductInferShapeFunctor, + PT_INFER_META(phi::BilinearTensorProductInferMeta)); +DELCARE_INFER_SHAPE_FUNCTOR( + bilinear_tensor_product_grad, BilinearTensorProductGradInferShapeFunctor, + PT_INFER_META(phi::BilinearTensorProductGradInferMeta)); + REGISTER_OPERATOR( bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpMaker, ops::BilinearTensorProductGradOpMaker, - ops::BilinearTensorProductGradOpMaker); + ops::BilinearTensorProductGradOpMaker, + BilinearTensorProductInferShapeFunctor); REGISTER_OPERATOR(bilinear_tensor_product_grad, - ops::BilinearTensorProductOpGrad); -REGISTER_OP_CPU_KERNEL( - bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_CPU_KERNEL( - bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductOpGrad, + BilinearTensorProductGradInferShapeFunctor); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.cu b/paddle/fluid/operators/bilinear_tensor_product_op.cu deleted file mode 100644 index c2b4f69e68545..0000000000000 --- a/paddle/fluid/operators/bilinear_tensor_product_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/bilinear_tensor_product_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_CUDA_KERNEL( - bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h deleted file mode 100644 index 2dbe3a132d78a..0000000000000 --- a/paddle/fluid/operators/bilinear_tensor_product_op.h +++ /dev/null @@ -1,181 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/blas/blas.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -using EigenMatrix = framework::EigenMatrix; - -template -class BilinearTensorProductKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* weight = ctx.Input("Weight"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - auto y_mat = EigenMatrix::From(*y); - auto output_mat = EigenMatrix::From(*out); - - auto batch_size = x->dims()[0]; - auto weight_dims = weight->dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - auto& place = *ctx.template device_context().eigen_device(); - auto& dev_ctx = ctx.template device_context(); - - // Create the intermediate variable to calculate the result of - // Input(X) multiplied by Input(Weight_i), the formula is: - // left_mul = X Weight_i. - Tensor left_mul; - left_mul.mutable_data(phi::make_ddim({batch_size, y_dim}), - ctx.GetPlace()); - auto left_mul_mat = EigenMatrix::From(left_mul); - - for (int i = 0; i < out_dim; ++i) { - auto output_col_vec = output_mat.chip(i, 1); - Tensor weight_mat = - weight->Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); - phi::funcs::GetBlas(dev_ctx).GEMM( - CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); - output_col_vec.device(place) = - (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); - } - if (bias) { - auto bias_vec = EigenMatrix::From(*bias); - Eigen::DSizes bcast(batch_size, 1); - output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; - } - } -}; - -template -class BilinearTensorProductGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - const Tensor* y = ctx.Input("Y"); - const Tensor* weight = ctx.Input("Weight"); - Tensor* d_x = ctx.Output(framework::GradVarName("X")); - Tensor* d_y = ctx.Output(framework::GradVarName("Y")); - Tensor* d_weight = ctx.Output(framework::GradVarName("Weight")); - Tensor* d_bias = ctx.Output(framework::GradVarName("Bias")); - const Tensor* d_out = ctx.Input(framework::GradVarName("Out")); - - auto batch_size = x->dims()[0]; - auto weight_dims = weight->dims(); - int out_dim = weight_dims[0]; - auto x_dim = weight_dims[1]; - auto y_dim = weight_dims[2]; - - auto x_mat = EigenMatrix::From(*x); - auto y_mat = EigenMatrix::From(*y); - auto d_out_mat = EigenMatrix::From(*d_out); - auto& place = *ctx.template device_context().eigen_device(); - auto& dev_ctx = ctx.template device_context(); - // Create the intermediate variable to calculate the Output(Y@Grad). - Tensor x_scale; - x_scale.mutable_data(phi::make_ddim({batch_size, x_dim}), - ctx.GetPlace()); - auto x_scale_mat = EigenMatrix::From(x_scale); - - // Create the intermediate variable to calculate the Output(X@Grad). 
- Tensor y_scale; - y_scale.mutable_data(phi::make_ddim({batch_size, y_dim}), - ctx.GetPlace()); - auto y_scale_mat = EigenMatrix::From(y_scale); - - phi::funcs::SetConstant set_zero; - - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_x, static_cast(0)); - } - - if (d_y) { - d_y->mutable_data(ctx.GetPlace()); - set_zero(dev_ctx, d_y, static_cast(0)); - } - - if (d_weight) { - d_weight->mutable_data(ctx.GetPlace()); - } - - auto blas = phi::funcs::GetBlas(ctx); - - // Caculate the Output(X@Grad) and Output(Y@Grad). - if (d_x || d_y || d_weight) { - Eigen::DSizes bcast_for_x(1, y_dim); - Eigen::DSizes bcast_for_y(1, x_dim); - Eigen::DSizes bcast_for_weight(1, x_dim); - - for (int i = 0; i < out_dim; ++i) { - Tensor weight_i = - weight->Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); - auto output_vec = d_out_mat.chip(i, 1); - - if (d_x) { - y_scale_mat.device(place) = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_x) * - y_mat; - blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, - y_scale.data(), weight_i.data(), 1, d_x->data()); - } - - if (d_y || d_weight) { - auto output_vec_y = - output_vec.reshape(Eigen::DSizes(batch_size, 1)) - .broadcast(bcast_for_y); - x_scale_mat.device(place) = output_vec_y * x_mat; - if (d_y) { - blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, - x_scale.data(), weight_i.data(), 1, d_y->data()); - } - if (d_weight) { - Tensor d_weight_i = d_weight->Slice(i, i + 1).Resize( - phi::make_ddim({x_dim, y_dim})); - blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1, - x_scale.data(), y->data(), 0, d_weight_i.data()); - } - } - } - } - - // calculate the gradient of Input(Bias). - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_mat = framework::EigenVector::Flatten(*d_bias); - d_bias_mat.device(place) = d_out_mat.sum(Eigen::DSizes(0)); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index c4ae2e0b371c1..e08eae0fc68f4 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -16,6 +16,54 @@ limitations under the License. 
*/ namespace phi { +void BilinearTensorProductGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto weight_dims = weight.dims(); + auto out_dims = dout.dims(); + + PADDLE_ENFORCE_EQ( + out_dims.size(), + 2UL, + errors::InvalidArgument("The input(Out@GRAD) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + x_dims[0], + out_dims[0], + errors::InvalidArgument( + "The first dimension(batch_size) of input(Out@GRAD) must be " + "equal to the first dimension of the Input(X).")); + PADDLE_ENFORCE_EQ( + weight_dims[0], + out_dims[1], + errors::InvalidArgument( + "The second dimension of input(Out@GRAD) must be equal to " + "the third dimension of the Input(Weight).")); + + if (dx) { + dx->set_dims(x_dims); + dx->set_dtype(x.dtype()); + } + if (dy) { + dy->set_dims(y_dims); + dy->set_dtype(y.dtype()); + } + if (dweight) { + dweight->set_dims(weight_dims); + dweight->set_dtype(weight.dtype()); + } + if (dbias) { + dbias->set_dims({1, out_dims[1]}); + dbias->set_dtype(dout.dtype()); + } +} + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 965c380db25ec..35f988bbc0b85 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -20,6 +20,15 @@ limitations under the License. */ namespace phi { +void BilinearTensorProductGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + const MetaTensor& dout, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dweight, + MetaTensor* dbias); + void GeneralBinaryGradInferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* dx, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index d72033f952857..7a0db3d5c17ee 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -18,6 +18,72 @@ limitations under the License. */ #include "paddle/phi/kernels/funcs/concat_funcs.h" namespace phi { +void BilinearTensorProductInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + paddle::optional bias, + MetaTensor* out, + MetaConfig config) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + auto weight_dims = weight.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size(), + 2UL, + errors::InvalidArgument("The input(X) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + y_dims.size(), + 2UL, + errors::InvalidArgument("The input(Y) must be a 2D Tensor.")); + PADDLE_ENFORCE_EQ( + weight_dims.size(), + 3UL, + errors::InvalidArgument( + "Expected the input(Weight) is a 3D tensor. 
But received %dD tensor.", + weight_dims.size())); + if (config.is_runtime || (x_dims[0] > 0 && y_dims[0] > 0)) { + PADDLE_ENFORCE_EQ(x_dims[0], + y_dims[0], + errors::InvalidArgument( + "The first dimension(batch_size) of input(X) must be " + "equal to the first dimension of the input(Y).")); + } + PADDLE_ENFORCE_EQ(x_dims[1], + weight_dims[1], + errors::InvalidArgument( + "The second dimension of input(X) must be equal to " + "the second dimension of the input(Weight).")); + PADDLE_ENFORCE_EQ(y_dims[1], + weight_dims[2], + errors::InvalidArgument( + "The second dimension of input(Y) must be equal to " + "the third dimension of the input(Weight).")); + + if (bias.get_ptr()) { + auto bias_dims = bias->dims(); + PADDLE_ENFORCE_EQ(bias_dims.size(), + 2UL, + errors::InvalidArgument( + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector).")); + PADDLE_ENFORCE_EQ(bias_dims[0], + 1UL, + errors::InvalidArgument( + "The Input(Bias) must be a 2-D tensor with " + "the 2nd dimension fixed to 1 (a row vector).")); + PADDLE_ENFORCE_EQ(bias_dims[1], + weight_dims[0], + errors::InvalidArgument( + "The second dimension of input(Bias) must be equal " + "to the first dimension of the input(Weight).")); + } + + out->set_dims({x_dims[0], weight_dims[0]}); + out->share_lod(x); + out->set_dtype(x.dtype()); +} + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 589fc33333d0c..a5fb2a4cbddc3 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -18,6 +18,13 @@ limitations under the License. */ #include "paddle/phi/core/meta_tensor.h" namespace phi { +void BilinearTensorProductInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& weight, + paddle::optional bias, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, diff --git a/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h new file mode 100644 index 0000000000000..499aa1e0b2ea9 --- /dev/null +++ b/paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BilinearTensorProductGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* dweight, + DenseTensor* dbias); + +} // namespace phi diff --git a/paddle/phi/kernels/bilinear_tensor_product_kernel.h b/paddle/phi/kernels/bilinear_tensor_product_kernel.h new file mode 100644 index 0000000000000..b34e8946ddd58 --- /dev/null +++ b/paddle/phi/kernels/bilinear_tensor_product_kernel.h @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void BilinearTensorProductKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + paddle::optional bias, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc new file mode 100644 index 0000000000000..2268212316af6 --- /dev/null +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product_grad, + CPU, + ALL_LAYOUT, + phi::BilinearTensorProductGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc new file mode 100644 index 0000000000000..25bc5913865a0 --- /dev/null +++ b/paddle/phi/kernels/cpu/bilinear_tensor_product_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product, + CPU, + ALL_LAYOUT, + phi::BilinearTensorProductKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu b/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu new file mode 100644 index 0000000000000..f4f69ee83eea1 --- /dev/null +++ b/paddle/phi/kernels/gpu/bilinear_tensor_product_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/bilinear_tensor_product_grad_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product_grad, + GPU, + ALL_LAYOUT, + phi::BilinearTensorProductGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu b/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu new file mode 100644 index 0000000000000..b81b842cedba2 --- /dev/null +++ b/paddle/phi/kernels/gpu/bilinear_tensor_product_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bilinear_tensor_product_kernel.h" +#include "paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(bilinear_tensor_product, + GPU, + ALL_LAYOUT, + phi::BilinearTensorProductKernel, + float, + double) {} diff --git a/paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h new file mode 100644 index 0000000000000..c199833b42a99 --- /dev/null +++ b/paddle/phi/kernels/impl/bilinear_tensor_product_grad_kernel_impl.h @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +void BilinearTensorProductGradKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + const DenseTensor& dout, + DenseTensor* dx, + DenseTensor* dy, + DenseTensor* dweight, + DenseTensor* dbias) { + auto batch_size = x.dims()[0]; + auto weight_dims = weight.dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + + auto x_mat = EigenMatrix::From(x); + auto y_mat = EigenMatrix::From(y); + auto dout_mat = EigenMatrix::From(dout); + auto& place = *ctx.eigen_device(); + // Create the intermediate variable to calculate the Output(Y@Grad). + DenseTensor x_scale; + x_scale.Resize(make_ddim({batch_size, x_dim})); + ctx.template Alloc(&x_scale); + auto x_scale_mat = EigenMatrix::From(x_scale); + + // Create the intermediate variable to calculate the Output(X@Grad). + DenseTensor y_scale; + y_scale.Resize(make_ddim({batch_size, y_dim})); + ctx.template Alloc(&y_scale); + auto y_scale_mat = EigenMatrix::From(y_scale); + + funcs::SetConstant set_zero; + + if (dx) { + ctx.template Alloc(dx); + set_zero(ctx, dx, static_cast(0)); + } + + if (dy) { + ctx.template Alloc(dy); + set_zero(ctx, dy, static_cast(0)); + } + + if (dweight) { + ctx.template Alloc(dweight); + } + + auto blas = funcs::GetBlas(ctx); + + // Caculate the Output(X@Grad) and Output(Y@Grad). 
+ if (dx || dy || dweight) { + Eigen::DSizes bcast_for_x(1, y_dim); + Eigen::DSizes bcast_for_y(1, x_dim); + Eigen::DSizes bcast_for_weight(1, x_dim); + + for (int i = 0; i < out_dim; ++i) { + DenseTensor weight_i = + weight.Slice(i, i + 1).Resize(make_ddim({x_dim, y_dim})); + auto output_vec = dout_mat.chip(i, 1); + + if (dx) { + y_scale_mat.device(place) = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_x) * + y_mat; + blas.GEMM(CblasNoTrans, + CblasTrans, + batch_size, + x_dim, + y_dim, + 1, + y_scale.data(), + weight_i.data(), + 1, + dx->data()); + } + + if (dy || dweight) { + auto output_vec_y = + output_vec.reshape(Eigen::DSizes(batch_size, 1)) + .broadcast(bcast_for_y); + x_scale_mat.device(place) = output_vec_y * x_mat; + if (dy) { + blas.GEMM(CblasNoTrans, + CblasNoTrans, + batch_size, + y_dim, + x_dim, + 1, + x_scale.data(), + weight_i.data(), + 1, + dy->data()); + } + if (dweight) { + DenseTensor dweight_i = + dweight->Slice(i, i + 1).Resize(make_ddim({x_dim, y_dim})); + blas.GEMM(CblasTrans, + CblasNoTrans, + x_dim, + y_dim, + batch_size, + 1, + x_scale.data(), + y.data(), + 0, + dweight_i.data()); + } + } + } + } + + // calculate the gradient of Input(Bias). + if (dbias) { + ctx.template Alloc(dbias); + auto dbias_mat = EigenVector::Flatten(*dbias); + dbias_mat.device(place) = dout_mat.sum(Eigen::DSizes(0)); + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h new file mode 100644 index 0000000000000..3f30a4b958ebe --- /dev/null +++ b/paddle/phi/kernels/impl/bilinear_tensor_product_kernel_impl.h @@ -0,0 +1,75 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/blas/blas.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/utils/optional.h" + +namespace phi { + +template +void BilinearTensorProductKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& weight, + paddle::optional bias, + DenseTensor* out) { + ctx.template Alloc(out); + + auto y_mat = EigenMatrix::From(y); + auto output_mat = EigenMatrix::From(*out); + + auto batch_size = x.dims()[0]; + auto weight_dims = weight.dims(); + int out_dim = weight_dims[0]; + auto x_dim = weight_dims[1]; + auto y_dim = weight_dims[2]; + auto& place = *ctx.eigen_device(); + + // Create the intermediate variable to calculate the result of + // Input(X) multiplied by Input(Weight_i), the formula is: + // left_mul = X Weight_i. 
+ DenseTensor left_mul; + left_mul.Resize(phi::make_ddim({batch_size, y_dim})); + ctx.template Alloc(&left_mul); + auto left_mul_mat = EigenMatrix::From(left_mul); + + for (int i = 0; i < out_dim; ++i) { + auto output_col_vec = output_mat.chip(i, 1); + DenseTensor weight_mat = + weight.Slice(i, i + 1).Resize(phi::make_ddim({x_dim, y_dim})); + phi::funcs::GetBlas(ctx).GEMM(CblasNoTrans, + CblasNoTrans, + batch_size, + y_dim, + x_dim, + 1, + x.data(), + weight_mat.data(), + 0, + left_mul.data()); + output_col_vec.device(place) = + (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); + } + if (bias.get_ptr()) { + auto bias_vec = EigenMatrix::From(*(bias.get_ptr())); + Eigen::DSizes bcast(batch_size, 1); + output_mat.device(place) = bias_vec.broadcast(bcast) + output_mat; + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc b/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc new file mode 100644 index 0000000000000..570bf7ce943d6 --- /dev/null +++ b/paddle/phi/ops/compat/bilinear_tensor_product_sig.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BilinearTensorProductOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "bilinear_tensor_product", {"X", "Y", "Weight", "Bias"}, {}, {"Out"}); +} + +KernelSignature BilinearTensorProductGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("bilinear_tensor_product_grad", + {"X", "Y", "Weight", GradVarName("Out")}, + {}, + {GradVarName("X"), + GradVarName("Y"), + GradVarName("Weight"), + GradVarName("Bias")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product, + phi::BilinearTensorProductOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(bilinear_tensor_product_grad, + phi::BilinearTensorProductGradOpArgumentMapping); From caea126c689ca9d764658664cca1fbd0992ef780 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 26 Feb 2022 13:35:57 +0800 Subject: [PATCH 68/85] Support custom implement for C++ API (#39521) * Support custom implement for C++ API * rename api_invoke_impl to api_custom_impl * remove manual_api * delete mutable_data in copy_to api * fix problem of copy_to * add unittest for infer_meta_fn_factory * fix split cofig in yaml * fix split cofig in yaml * modify sum api yaml * add copy_to wrapped infermeta * rollback copy impl --- paddle/phi/api/CMakeLists.txt | 2 +- paddle/phi/api/all.h | 1 - paddle/phi/api/lib/CMakeLists.txt | 25 ++++--- .../lib/{manual_api.cc => api_custom_impl.cc} | 58 +++++---------- .../manual_api.h => lib/api_custom_impl.h} | 17 ++--- paddle/phi/api/lib/api_declare.h | 1 - paddle/phi/api/lib/tensor.cc | 70 +------------------ paddle/phi/api/lib/tensor_method.cc | 68 ++++++++++++++++++ paddle/phi/kernels/CMakeLists.txt | 3 +- paddle/phi/tests/api/CMakeLists.txt | 4 +- paddle/phi/tests/api/test_data_transform.cc | 1 - 
paddle/phi/tests/api/test_split_api.cc | 1 - paddle/phi/tests/api/test_to_api.cc | 2 +- .../phi/tests/kernels/test_split_dev_api.cc | 1 - python/paddle/utils/code_gen/api.yaml | 11 ++- python/paddle/utils/code_gen/api_gen.py | 1 + .../paddle/utils/code_gen/backward_api_gen.py | 1 + 17 files changed, 122 insertions(+), 145 deletions(-) rename paddle/phi/api/lib/{manual_api.cc => api_custom_impl.cc} (73%) rename paddle/phi/api/{include/manual_api.h => lib/api_custom_impl.h} (61%) diff --git a/paddle/phi/api/CMakeLists.txt b/paddle/phi/api/CMakeLists.txt index c2ba5d406ba7b..d632db046d15c 100644 --- a/paddle/phi/api/CMakeLists.txt +++ b/paddle/phi/api/CMakeLists.txt @@ -1,2 +1,2 @@ add_subdirectory(lib) -cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api manual_api sparse_api) +cc_library(phi_api SRCS all.cc DEPS phi_function_api phi_bw_function_api sparse_api) diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 06f3cd8447606..748ed11058af6 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -26,7 +26,6 @@ limitations under the License. */ // new pten apis #include "paddle/phi/api/include/api.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/tensor.h" diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index d50f62d309066..5edb83f8c3fc0 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -3,11 +3,11 @@ add_subdirectory(utils) cc_library(ext_compat_utils SRCS ext_compat_utils.cc DEPS place) if (WITH_GPU) - nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce manual_api) + nv_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) elseif (WITH_ROCM) - hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce manual_api) + hip_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) else() - cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce manual_api) + cc_library(phi_tensor_raw SRCS tensor.cc DEPS tensor_base dense_tensor phi_api_utils ext_compat_utils phi_enforce) endif() set(api_gen_base ${CMAKE_SOURCE_DIR}/python/paddle/utils/code_gen/api_base.py) @@ -83,17 +83,16 @@ add_custom_command( DEPENDS ${api_yaml_file} ${wrapped_infermeta_gen_file} ${api_gen_base} VERBATIM) +cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor_raw) +cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) + cc_library(kernel_dispatch SRCS kernel_dispatch.cc DEPS phi_tensor_raw phi_context kernel_factory) cc_library(phi_data_transform SRCS data_transform.cc DEPS phi_tensor_raw transfer_layout_kernel cast_kernel data_device_transform) -cc_library(manual_api SRCS manual_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) - -cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) +cc_library(api_custom_impl SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) -cc_library(op_meta_info SRCS op_meta_info.cc DEPS phi_tensor) +cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) +cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform 
api_custom_impl) +cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch phi_data_transform) +cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api api_custom_impl) -cc_library(wrapped_infermeta SRCS ${wrapped_infermeta_source_file} DEPS phi) - -cc_library(sparse_api SRCS sparse_api.cc DEPS phi_tensor phi kernel_dispatch phi_data_transform) -cc_library(phi_function_api SRCS ${api_source_file} DEPS phi_tensor phi kernel_dispatch phi_data_transform wrapped_infermeta) -cc_library(phi_dygraph_api SRCS ${dygraph_api_source_file} DEPS phi_tensor phi kernel_dispatch phi_data_transform) -cc_library(phi_bw_function_api SRCS ${bw_api_source_file} DEPS phi_tensor phi kernel_dispatch backward_infermeta phi_data_transform phi_function_api) +cc_library(phi_tensor SRCS tensor_method.cc DEPS phi_tensor_raw phi_function_api) diff --git a/paddle/phi/api/lib/manual_api.cc b/paddle/phi/api/lib/api_custom_impl.cc similarity index 73% rename from paddle/phi/api/lib/manual_api.cc rename to paddle/phi/api/lib/api_custom_impl.cc index 7bd4711cc3f30..66dba2cc2e1b0 100644 --- a/paddle/phi/api/lib/manual_api.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/phi/api/include/manual_api.h" - -#include - -#include "glog/logging.h" +#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_utils.h" @@ -25,23 +21,17 @@ limitations under the License. */ #include "paddle/phi/api/lib/utils/storage.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/unary.h" -PD_DECLARE_KERNEL(copy, CPU, ALL_LAYOUT); -PD_DECLARE_KERNEL(split, CPU, ALL_LAYOUT); - -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_DECLARE_KERNEL(copy, GPU, ALL_LAYOUT); -#endif - -#ifdef PADDLE_WITH_XPU -PD_DECLARE_KERNEL(copy, XPU, ALL_LAYOUT); -#endif +#include "glog/logging.h" namespace paddle { namespace experimental { -PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { +Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { // 1. 
Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); @@ -79,28 +69,15 @@ PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking) { return out; } -PADDLE_API std::vector split(const Tensor& x, - const ScalarArray& num_or_sections, - const Scalar& axis) { - Backend kernel_backend = Backend::UNDEFINED; - DataLayout kernel_layout = DataLayout::UNDEFINED; - DataType kernel_data_type = DataType::UNDEFINED; - - if (kernel_backend == Backend::UNDEFINED || - kernel_layout == DataLayout::UNDEFINED || - kernel_data_type == DataType::UNDEFINED) { - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); - if (kernel_backend == Backend::UNDEFINED) { - kernel_backend = kernel_key.backend(); - } - if (kernel_layout == DataLayout::UNDEFINED) { - kernel_layout = kernel_key.layout(); - } - if (kernel_data_type == DataType::UNDEFINED) { - kernel_data_type = kernel_key.dtype(); - } - } +std::vector split_impl(const Tensor& x, + const ScalarArray& num_or_sections, + const Scalar& axis) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + + Backend kernel_backend = kernel_key.backend(); + DataLayout kernel_layout = kernel_key.layout(); + DataType kernel_data_type = kernel_key.dtype(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "split", {kernel_backend, kernel_layout, kernel_data_type}); @@ -144,7 +121,6 @@ PADDLE_API std::vector split(const Tensor& x, return out; } + } // namespace experimental } // namespace paddle - -PD_REGISTER_API(Utils); diff --git a/paddle/phi/api/include/manual_api.h b/paddle/phi/api/lib/api_custom_impl.h similarity index 61% rename from paddle/phi/api/include/manual_api.h rename to paddle/phi/api/lib/api_custom_impl.h index 72d348f33918c..5acb68a328133 100644 --- a/paddle/phi/api/include/manual_api.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -19,22 +19,15 @@ limitations under the License. */ #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" -/** - * This file stores some special APIs that are implemented manually - * or difficult to automatically generated. - */ - namespace paddle { namespace experimental { // TODO(chenweihang): Replace backend by place when place is ready -PADDLE_API Tensor copy_to(const Tensor& x, Backend backend, bool blocking); +Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking); -// TODO(chentianyu03): Split API has extra logic to calculate the outputs size, -// api_gen do not support -PADDLE_API std::vector split(const Tensor& x, - const ScalarArray& num_or_sections, - const Scalar& axis); +std::vector split_impl(const Tensor& x, + const ScalarArray& num_or_sections, + const Scalar& axis); } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_declare.h b/paddle/phi/api/lib/api_declare.h index 26408290bd325..a5d3578d681b6 100644 --- a/paddle/phi/api/lib/api_declare.h +++ b/paddle/phi/api/lib/api_declare.h @@ -18,5 +18,4 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/api_registry.h" PD_DECLARE_API(Math); -PD_DECLARE_API(Utils); PD_DECLARE_API(SparseApi); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index 963aeec328e2a..ada08019f678a 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -19,7 +19,6 @@ limitations under the License. */ #include #include "glog/logging.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/lib/ext_compat_utils.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/api/lib/utils/storage.h" @@ -299,72 +298,7 @@ gpuStream_t Tensor::stream() const { } #endif -/* Part 5: Data Transform methods */ - -template -Tensor Tensor::copy_to(const PlaceType &target_place) const { - LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " - "2.3, and will be removed in version 2.4, please use " - "`copy_to` method without template argument instead. " - "reason: copying a Tensor to another device does not need " - "to specify the data type template argument."; - return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); -} - -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor Tensor::copy_to>( - const PlaceType &target_place) const; -template PADDLE_API Tensor -Tensor::copy_to(const PlaceType &target_place) const; - -Tensor Tensor::copy_to(Backend backend, bool blocking) const { - return experimental::copy_to(*this, backend, blocking); -} - -void Tensor::copy_(const Tensor &src, bool blocking) { - if (!src.is_initialized()) { - return; - } - VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); - if (defined()) { - PADDLE_ENFORCE_EQ(dtype(), - src.dtype(), - platform::errors::PreconditionNotMet( - "Tensor %s has different data type with Tensor %s, " - "Tensor Copy cannot be performed!", - name(), - src.name())); - PADDLE_ENFORCE_EQ(impl()->type_info().id(), - src.impl()->type_info().id(), - platform::errors::PreconditionNotMet( - "Tensor %s has different type with Tensor %s, Tensor " - "Copy cannot be performed!", - name(), - src.name())); - } - auto copy_tensor = - src.copy_to(phi::TransToPtenBackend(src.inner_place()), blocking); - set_impl(copy_tensor.impl()); -} - -/* Part 6: Status utils methods */ +/* Part 5: Status utils methods */ bool Tensor::defined() const { return impl_ != nullptr; } @@ -376,7 +310,7 @@ bool Tensor::is_initialized() const { void Tensor::reset() { impl_.reset(); } -/* Part 7: Operator overloading */ +/* Part 6: Operator overloading */ Tensor &Tensor::operator=(const Tensor &x) & { impl_ = x.impl_; diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index b67810d610f2f..7308a9d752c7a 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -14,15 +14,83 @@ 
limitations under the License. */ #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/ext_compat_utils.h" +#include "paddle/phi/core/compat/convert_utils.h" +#include "paddle/phi/core/tensor_base.h" + namespace paddle { namespace experimental { // declare cast api Tensor cast(const Tensor &x, DataType out_dtype); +Tensor copy_to(const Tensor &x, Backend backend, bool blocking); Tensor Tensor::cast(DataType target_type) const { return experimental::cast(*this, target_type); } +Tensor Tensor::copy_to(Backend backend, bool blocking) const { + return experimental::copy_to(*this, backend, blocking); +} + +template +Tensor Tensor::copy_to(const PlaceType &target_place) const { + LOG(WARNING) << "The Tensor's `copy_to` method is deprecated since version " + "2.3, and will be removed in version 2.4, please use " + "`copy_to` method without template argument instead. " + "reason: copying a Tensor to another device does not need " + "to specify the data type template argument."; + return copy_to(ConvertExtPlaceToBackend(target_place), /*blocking=*/false); +} + +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PADDLE_API Tensor Tensor::copy_to>( + const PlaceType &target_place) const; +template PADDLE_API Tensor Tensor::copy_to>( + const PlaceType &target_place) const; +template PADDLE_API Tensor +Tensor::copy_to(const PlaceType &target_place) const; + +void Tensor::copy_(const Tensor &src, bool blocking) { + if (!src.is_initialized()) { + return; + } + VLOG(3) << "Deep copy Tensor from " << src.name() << " to " << name(); + if (defined()) { + PADDLE_ENFORCE_EQ(dtype(), + src.dtype(), + platform::errors::PreconditionNotMet( + "Tensor %s has different data type with Tensor %s, " + "Tensor Copy cannot be performed!", + name(), + src.name())); + PADDLE_ENFORCE_EQ(impl()->type_info().id(), + src.impl()->type_info().id(), + platform::errors::PreconditionNotMet( + "Tensor %s has different type with Tensor %s, Tensor " + "Copy cannot be performed!", + name(), + src.name())); + } + auto copy_tensor = + src.copy_to(phi::TransToPtenBackend(src.inner_place()), blocking); + set_impl(copy_tensor.impl()); +} + } // namespace experimental } // namespace paddle diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index f27adf1de149b..1523401d1913a 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -18,7 +18,8 @@ set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) # NOTE: Some kernels depend on some targets that are not commonly used. # These targets are not suitable for common dependencies. # In this case, you need to manually generate them here. 
-set(MANUAL_BUILD_KERNELS softmax_kernel softmax_grad_kernel) +set(MANUAL_BUILD_KERNELS math_kernel softmax_kernel softmax_grad_kernel) +kernel_library(math_kernel DEPS ${COMMON_KERNEL_DEPS} cast_kernel copy_kernel) kernel_library(softmax_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) kernel_library(softmax_grad_kernel DEPS ${COMMON_KERNEL_DEPS} softmax) diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index ba3fe8d57b31b..cde085423e482 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -1,7 +1,7 @@ if(WITH_ROCM) - hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api manual_api glog) + hip_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api glog) else() - cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api manual_api glog) + cc_test(test_phi_tensor SRCS test_pten_tensor.cc DEPS phi_tensor phi_function_api glog) endif() cc_test(test_phi_exception SRCS test_pten_exception.cc DEPS gtest) diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc index 3df1866efb0df..2e38a1593461e 100644 --- a/paddle/phi/tests/api/test_data_transform.cc +++ b/paddle/phi/tests/api/test_data_transform.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include "paddle/phi/api/include/api.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/common/complex.h" #include "paddle/phi/core/compat/convert_utils.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/tests/api/test_split_api.cc b/paddle/phi/tests/api/test_split_api.cc index 9b3478e85e04c..0b836a010586d 100644 --- a/paddle/phi/tests/api/test_split_api.cc +++ b/paddle/phi/tests/api/test_split_api.cc @@ -17,7 +17,6 @@ #include "paddle/phi/api/include/api.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/kernel_registry.h" diff --git a/paddle/phi/tests/api/test_to_api.cc b/paddle/phi/tests/api/test_to_api.cc index c790e7bfa71da..d337a0b601a00 100644 --- a/paddle/phi/tests/api/test_to_api.cc +++ b/paddle/phi/tests/api/test_to_api.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include -#include "paddle/phi/api/include/manual_api.h" +#include "paddle/phi/api/include/api.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/paddle/phi/tests/kernels/test_split_dev_api.cc b/paddle/phi/tests/kernels/test_split_dev_api.cc index e6d6263128ec9..d5160933c1fa0 100644 --- a/paddle/phi/tests/kernels/test_split_dev_api.cc +++ b/paddle/phi/tests/kernels/test_split_dev_api.cc @@ -18,7 +18,6 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/split_kernel.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" -#include "paddle/phi/api/include/manual_api.h" #include "paddle/phi/api/lib/utils/allocator.h" #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/dense_tensor.h" diff --git a/python/paddle/utils/code_gen/api.yaml b/python/paddle/utils/code_gen/api.yaml index 390ccdd157363..7ea8493b67fd6 100644 --- a/python/paddle/utils/code_gen/api.yaml +++ b/python/paddle/utils/code_gen/api.yaml @@ -34,6 +34,11 @@ kernel : func : conj +- api : copy_to + args : (Tensor x, Backend backend, bool blocking) + output : Tensor + invoke : copy_to_impl(x, backend, blocking) + - api : divide args : (Tensor x, Tensor y) output : Tensor @@ -162,6 +167,11 @@ kernel : func : sign +- api : split + args : (Tensor x, ScalarArray num_or_sections, Scalar axis) + output : Tensor[] + invoke : split_impl(x, num_or_sections, axis) + - api : subtract args : (Tensor x, Tensor y) output : Tensor @@ -177,7 +187,6 @@ func : SumInferMeta kernel : func : sum - param : [x, axis, dtype, keep_dim] data_type : x - api : zeros_like diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index f1e69a21f28d8..77af217f7b52e 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -102,6 +102,7 @@ def source_include(header_file_path): #include "glog/logging.h" +#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/data_transform.h" diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index 28eb1de37b697..bde5d4c90b907 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -142,6 +142,7 @@ def source_include(header_file_path): #include "glog/logging.h" +#include "paddle/phi/api/lib/api_custom_impl.h" #include "paddle/phi/api/lib/api_registry.h" #include "paddle/phi/api/lib/api_utils.h" #include "paddle/phi/api/lib/data_transform.h" From 581b2c64b18fd5968cde2d158dca4327ca20b27a Mon Sep 17 00:00:00 2001 From: From00 Date: Sat, 26 Feb 2022 14:07:12 +0800 Subject: [PATCH 69/85] Move GumbelSoftmax OP to phi (#39873) * Move GumbelSoftmax OP to phi * platform::errors -> phi::errors; GumbelSoftmaxGradInferMeta -> backend.h/cc * Use axis util in kernel impl * Remove namespace platform::errors * Use GetCPUEngine in Device Context --- paddle/fluid/operators/gumbel_softmax_op.cc | 48 ++-- paddle/fluid/operators/gumbel_softmax_op.cu | 172 ------------ paddle/fluid/operators/gumbel_softmax_op.h | 249 ------------------ paddle/phi/infermeta/backward.cc | 12 + paddle/phi/infermeta/backward.h | 4 + paddle/phi/infermeta/unary.cc | 32 +++ paddle/phi/infermeta/unary.h | 11 + paddle/phi/kernels/CMakeLists.txt | 2 +- .../kernels/cpu/gumbel_softmax_grad_kernel.cc | 25 ++ .../phi/kernels/cpu/gumbel_softmax_kernel.cc | 121 +++++++++ .../kernels/gpu/gumbel_softmax_grad_kernel.cu | 25 ++ .../phi/kernels/gpu/gumbel_softmax_kernel.cu | 181 +++++++++++++ .../phi/kernels/gumbel_softmax_grad_kernel.h | 27 ++ paddle/phi/kernels/gumbel_softmax_kernel.h | 28 ++ .../impl/gumbel_softmax_grad_kernel_impl.h | 50 ++++ .../kernels/impl/gumbel_softmax_kernel_impl.h | 96 +++++++ paddle/phi/ops/compat/gumbel_softmax_sig.cc | 30 +++ 17 files changed, 658 insertions(+), 455 deletions(-) delete mode 100644 
paddle/fluid/operators/gumbel_softmax_op.cu delete mode 100644 paddle/fluid/operators/gumbel_softmax_op.h create mode 100644 paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc create mode 100644 paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc create mode 100644 paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu create mode 100644 paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu create mode 100644 paddle/phi/kernels/gumbel_softmax_grad_kernel.h create mode 100644 paddle/phi/kernels/gumbel_softmax_kernel.h create mode 100644 paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h create mode 100644 paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h create mode 100644 paddle/phi/ops/compat/gumbel_softmax_sig.cc diff --git a/paddle/fluid/operators/gumbel_softmax_op.cc b/paddle/fluid/operators/gumbel_softmax_op.cc index 95c6ed6690541..f8f8f3fd789ad 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cc +++ b/paddle/fluid/operators/gumbel_softmax_op.cc @@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/gumbel_softmax_op.h" -#include -#include -#include "paddle/fluid/operators/common_infer_shape_functions.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/unary.h" namespace paddle { namespace operators { @@ -23,10 +24,6 @@ class GumbelSoftmaxOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - return UnaryOpUnchangedInferShapeCheckAxis(ctx); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -71,20 +68,6 @@ Samples from the Gumbel-Softmax distribution and optionally discretizes. 
class GumbelSoftmaxGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "gumbel_softmax_grad"); - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", - "Out@GRAD", "gumbel_softmax_grad"); - PADDLE_ENFORCE_EQ( - ctx->GetInputDim("Out"), - ctx->GetInputDim(framework::GradVarName("Out")), - platform::errors::InvalidArgument("Input(Out) and its gradients " - "should have the same shape.")); - - ctx->SetOutputDim(framework::GradVarName("X"), - ctx->GetInputDim(framework::GradVarName("Out"))); - } }; template @@ -107,17 +90,16 @@ class GumbelSoftmaxGradOpMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax, GumbelSoftmaxInferShapeFunctor, + PT_INFER_META(phi::GumbelSoftmaxInferMeta)); +DELCARE_INFER_SHAPE_FUNCTOR(gumbel_softmax_grad, + GumbelSoftmaxGradInferShapeFunctor, + PT_INFER_META(phi::GumbelSoftmaxGradInferMeta)); + REGISTER_OPERATOR(gumbel_softmax, ops::GumbelSoftmaxOp, ops::GumbelSoftmaxOpMaker, ops::GumbelSoftmaxGradOpMaker, - ops::GumbelSoftmaxGradOpMaker); -REGISTER_OPERATOR(gumbel_softmax_grad, ops::GumbelSoftmaxGradOp); - -REGISTER_OP_CPU_KERNEL( - gumbel_softmax, - ops::GumbelSoftmaxKernel, - ops::GumbelSoftmaxKernel); -REGISTER_OP_CPU_KERNEL( - gumbel_softmax_grad, - ops::GumbelSoftmaxGradKernel, - ops::GumbelSoftmaxGradKernel); + ops::GumbelSoftmaxGradOpMaker, + GumbelSoftmaxInferShapeFunctor); +REGISTER_OPERATOR(gumbel_softmax_grad, ops::GumbelSoftmaxGradOp, + GumbelSoftmaxGradInferShapeFunctor); diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu deleted file mode 100644 index 880e3eb9f3f9a..0000000000000 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/gumbel_softmax_op.h" - -#if defined(__NVCC__) || defined(__HIPCC__) -#ifdef __NVCC__ -#include "cub/cub.cuh" -#endif -#ifdef __HIPCC__ -#include -namespace cub = hipcub; -#endif - -#include -#include -#include -#include -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/memory/memcpy.h" - -namespace paddle { -namespace operators { - -template -using KeyValuePair = cub::KeyValuePair; - -template -struct UniformCUDAGenerator { - T min_, max_; - unsigned int seed_; - unsigned int offset_ = 0; - HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed) - : min_(min), max_(max), seed_(seed) {} - HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed, - unsigned int offset) - : min_(min), max_(max), seed_(seed), offset_(offset) {} - - HOSTDEVICE T operator()(const unsigned int n) const { - thrust::minstd_rand rng; - rng.seed(seed_); - thrust::uniform_real_distribution dist(min_, max_); - rng.discard(n + offset_); - return dist(rng); - } -}; - -template -__global__ void OneHotCUDAKernel(const int64_t height, const int64_t width, - const int64_t size_out_axis, const T init, - const T* in, T* out) { - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage temp_storage; - - for (int64_t idx = blockIdx.x; idx < height; idx += gridDim.x) { - KeyValuePair kv_pair = {-1, init}; - int h = idx / size_out_axis; - int w = idx % size_out_axis; - cub::ArgMax reducer; - for (int k = threadIdx.x; k < width; k += blockDim.x) { - kv_pair = reducer( - {k, in[h * width * size_out_axis + k * size_out_axis + w]}, kv_pair); - } - kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); - if (threadIdx.x == 0) { - int index = static_cast(kv_pair.key); - out[h * width * size_out_axis + index * size_out_axis + w] = 1; - } - __syncthreads(); - } -} - -template -struct OneHotGenerator { - static void Transform(const platform::CUDADeviceContext& context, - const Tensor& X, Tensor* Out, int axis) { - const int size_to_axis = SizeToAxis(axis, X.dims()); - const int size_from_axis = SizeFromAxis(axis, X.dims()); - const int size_out_axis = SizeOutAxis(axis, X.dims()); - constexpr int thread_size = 512; - int64_t max_grid_dimx = context.GetCUDAMaxGridDimSize()[0]; - int64_t height = size_to_axis * size_out_axis; - int block_size = height < max_grid_dimx ? 
height : max_grid_dimx; - - Tensor input_tensor; - input_tensor.mutable_data(Out->dims(), platform::CUDAPlace()); - paddle::framework::TensorCopy(*Out, context.GetPlace(), &input_tensor); - phi::funcs::set_constant(context, Out, 0.0); - OneHotCUDAKernel< - T, thread_size><<>>( - height, size_from_axis / size_out_axis, size_out_axis, - std::numeric_limits::lowest(), input_tensor.data(), - Out->data()); - } -}; - -template -__global__ void AddGumbelNoiseCUDAKernel(const T* input_data, T* output_data, - T* noise, const float temperature, - int64_t n) { - int index = threadIdx.x + blockIdx.x * blockDim.x; - int step = blockDim.x * gridDim.x; - for (int64_t i = index; i < n; i += step) { - T gumbel_noise = -log(-log(noise[i])); - output_data[i] = (gumbel_noise + input_data[i]) / temperature; - } -} - -template -struct GumbleNoiseGenerator { - static void Transform(const platform::CUDADeviceContext& context, - const T* input_data, T* output_data, int size_to_axis, - int size_from_axis, const float temperature) { - Tensor random_tensor; - int64_t size = size_to_axis * size_from_axis; - T* random_data = - random_tensor.mutable_data({size}, platform::CUDAPlace()); - thrust::counting_iterator index_sequence_begin(0); - - // generate gumbel noise - int device_id = context.GetPlace().GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - if (gen_cuda->GetIsInitPy()) { - auto seed_offset = gen_cuda->IncrementOffset(1); - int64_t gen_offset = size * seed_offset.second; - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); - } else { - const unsigned int seed = std::random_device()(); - thrust::transform(index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(random_data), - UniformCUDAGenerator(0.00001, 1, seed)); - } - - // add gumbel noise to X - const int thread_size = 512; - int64_t block_size = (size + thread_size) / thread_size; - AddGumbelNoiseCUDAKernel< - T><<>>( - input_data, output_data, random_data, temperature, size); - } -}; - -#endif -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - gumbel_softmax, ops::GumbelSoftmaxKernel, - ops::GumbelSoftmaxKernel); -REGISTER_OP_CUDA_KERNEL( - gumbel_softmax_grad, - ops::GumbelSoftmaxGradKernel, - ops::GumbelSoftmaxGradKernel); diff --git a/paddle/fluid/operators/gumbel_softmax_op.h b/paddle/fluid/operators/gumbel_softmax_op.h deleted file mode 100644 index daddd13d7be5e..0000000000000 --- a/paddle/fluid/operators/gumbel_softmax_op.h +++ /dev/null @@ -1,249 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/softmax.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using DDim = framework::DDim; - -template -using EigenMatrix = framework::EigenMatrix; - -template -using EigenTensor = framework::EigenTensor; - -static inline int CanonicalAxis(const int axis, const int rank) { - if (axis < 0) { - return axis + rank; - } - return axis; -} - -static inline int SizeToAxis(const int axis, DDim dims) { - int size = 1; - for (int i = 0; i < axis; i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeFromAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -static inline int SizeOutAxis(const int axis, DDim dims) { - int size = 1; - for (int i = axis + 1; i < dims.size(); i++) { - size *= dims[i]; - } - return size; -} - -template -struct ArgMaxFunctor { - void operator()(const DeviceContext& ctx, const Tensor& in, - Tensor* index_tensor, const int64_t& axis) { - auto in_eigen = EigenTensor::From(in, in.dims()); - auto index_eigen = EigenTensor::From(*index_tensor); - index_eigen = in_eigen.argmax(axis).template cast(); - } -}; -template -struct GumbleNoiseGenerator; - -template -struct OneHotGenerator; - -template -struct GumbleNoiseGenerator { - static void Transform(const platform::CPUDeviceContext& context, - const T* input_data, T* output_data, int size_to_axis, - int size_from_axis, const float temperature) { - // generate uniform random number - const int size = size_to_axis * size_from_axis; - std::uniform_real_distribution dist(0.00001, 1); - auto engine = paddle::framework::GetCPURandomEngine(0); - Tensor random_tensor; - auto* random_data = - random_tensor.mutable_data({size}, platform::CPUPlace()); - for (int64_t i = 0; i < size; ++i) { - random_data[i] = dist(*engine); - } - - // generate gumbel noise - framework::DDim dim_2d{size_to_axis, size_from_axis}; - auto gumbel_noise_eigen = EigenMatrix::From(random_tensor, dim_2d); - gumbel_noise_eigen = -(((-(gumbel_noise_eigen.log())).log())); - - // add noise - for (int64_t i = 0; i < size_to_axis * size_from_axis; i++) { - output_data[i] = (input_data[i] + random_data[i]) / temperature; - } - } -}; -template -struct OneHotGenerator { - static void Transform(const platform::CPUDeviceContext& context, - const Tensor& X, Tensor* Out, int axis) { - Tensor index; - std::vector index_dim; - const auto rank = X.dims().size(); - const int size_to_axis = SizeToAxis(axis, X.dims()); - const int size_from_axis = SizeFromAxis(axis, X.dims()); - const int size_out_axis = SizeOutAxis(axis, X.dims()); - - for (int i = 0; i < X.dims().size(); i++) { - if (i != axis) index_dim.push_back(X.dims().Get()[i]); - } - DDim index_ddim(index_dim.data(), rank - 1); - index.Resize(index_ddim); - auto* index_data = index.mutable_data(context.GetPlace()); - -#define CALL_ARG_MINMAX_FUNCTOR(rank) \ - ArgMaxFunctor functor##rank; \ - functor##rank(context, *Out, &index, axis); - switch (Out->dims().size()) { - case 1: - CALL_ARG_MINMAX_FUNCTOR(1); - break; - case 2: - CALL_ARG_MINMAX_FUNCTOR(2); - break; - case 3: - CALL_ARG_MINMAX_FUNCTOR(3); - break; - case 4: - CALL_ARG_MINMAX_FUNCTOR(4); - break; - case 5: - CALL_ARG_MINMAX_FUNCTOR(5); - break; - case 6: - CALL_ARG_MINMAX_FUNCTOR(6); - break; - default: - 
PADDLE_ENFORCE_LE(Out->dims().size(), 6, - platform::errors::InvalidArgument( - "gumbel_softmax operator doesn't supports " - "tensors whose ranks are greater " - "than 6 in CPU mode.")); - break; -#undef CALL_ARG_MINMAX_FUNCTOR - } - - phi::funcs::set_constant(context, Out, 0.0); - for (int i = 0; i < size_to_axis; i++) { - for (int j = 0; j < size_out_axis; j++) { - *(Out->data() + i * size_from_axis + j + - index_data[i * size_out_axis + j] * size_out_axis) = 1.0; - } - } - } -}; - -template -class GumbelSoftmaxKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* X = context.Input("X"); - auto* Out = context.Output("Out"); - const int rank = X->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = X->dims()[axis]; - const bool is_hard = context.Attr("hard"); - const float temperature = context.Attr("temperature"); - PADDLE_ENFORCE_GT(temperature, 0, - platform::errors::InvalidArgument( - "The temperature must be greater than 0. But " - "received temperature = %f", - temperature)); - - // allocate memory on device. - Out->mutable_data(context.GetPlace()); - if (Out->numel() == 0) { - return; - } - - const int size_to_axis = SizeToAxis(axis, X->dims()); - const int size_from_axis = SizeFromAxis(axis, X->dims()); - Tensor X_noise_2d, Out_2d; - X_noise_2d.Resize({size_to_axis, size_from_axis}); - Out_2d.ShareDataWith(*Out).Resize({size_to_axis, size_from_axis}); - - // generate gumbel noise and add it to X - auto* x_noise_data = X_noise_2d.mutable_data(context.GetPlace()); - GumbleNoiseGenerator::Transform( - context.template device_context(), X->data(), - x_noise_data, size_to_axis, size_from_axis, temperature); - -#ifdef PADDLE_ON_INFERENCE - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_noise_2d, - &Out_2d); -#else - math::SoftmaxFunctor()( - context.template device_context(), axis_dim, &X_noise_2d, - &Out_2d); -#endif - - if (is_hard) { - OneHotGenerator::Transform( - context.template device_context(), *X, Out, axis); - } - } -}; - -template -class GumbelSoftmaxGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); - const int rank = dX->dims().size(); - const int axis = CanonicalAxis(context.Attr("axis"), rank); - int axis_dim = dX->dims()[axis]; - // allocate memory on device. 
- dX->mutable_data(context.GetPlace()); - if (dX->numel() == 0) { - return; - } - - const int size_to_axis = SizeToAxis(axis, dX->dims()); - const int size_from_axis = SizeFromAxis(axis, dX->dims()); - Tensor dX_2d, Out_2d, dOut_2d; - dX_2d.ShareDataWith(*dX).Resize({size_to_axis, size_from_axis}); - Out_2d.ShareDataWith(*Out).Resize({size_to_axis, size_from_axis}); - dOut_2d.ShareDataWith(*dOut).Resize({size_to_axis, size_from_axis}); - math::SoftmaxGradFunctor()( - context.template device_context(), axis_dim, &Out_2d, - &dOut_2d, &dX_2d); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index e08eae0fc68f4..643a6dc9ddf36 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -76,4 +76,16 @@ void GeneralBinaryGradInferMeta(const MetaTensor& x, } } +void GumbelSoftmaxGradInferMeta(const MetaTensor& out, + const MetaTensor& dout, + int axis, + MetaTensor* dx) { + PADDLE_ENFORCE_EQ( + out.dims(), + dout.dims(), + errors::InvalidArgument( + "Input(Out) and its gradients should have the same shape.")); + dx->share_meta(dout); +} + } // namespace phi diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 35f988bbc0b85..5afa678ddac70 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -34,4 +34,8 @@ void GeneralBinaryGradInferMeta(const MetaTensor& x, MetaTensor* dx, MetaTensor* dy); +void GumbelSoftmaxGradInferMeta(const MetaTensor& out, + const MetaTensor& dout, + int axis, + MetaTensor* dx); } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 9b2f310e85d4b..1a9dbf90dd45f 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -27,6 +27,30 @@ void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out) { out->share_meta(x); } +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out) { + auto rank = x.dims().size(); + PADDLE_ENFORCE_GE( + axis, + -rank, + errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). But received axis: %d, R: %d.", + axis, + rank)); + PADDLE_ENFORCE_LT( + axis, + rank, + phi::errors::InvalidArgument( + "Attr(axis) value should be in range [-R, R-1], " + "R is the rank of Input(X). 
But received axis: %d, R: %d.", + axis, + rank)); + out->share_meta(x); +} + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, @@ -75,6 +99,14 @@ void FlattenInferMeta(const MetaTensor& x, } } +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out) { + UnchangedInferMetaCheckAxis(x, axis, out); +} + void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out) { out->set_dims(x.dims()); out->set_dtype(out_dtype); diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 40bf4e333569c..172ea2a565e18 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -34,11 +34,22 @@ class MetaConfig; void UnchangedInferMeta(const MetaTensor& x, MetaTensor* out); +// meta x -> out without change, check if axis in range [-Rank(x), Rank(x)-1] +void UnchangedInferMetaCheckAxis(const MetaTensor& x, + int axis, + MetaTensor* out); + void FlattenInferMeta(const MetaTensor& x, int start_axis, int stop_axis, MetaTensor* out); +void GumbelSoftmaxInferMeta(const MetaTensor& x, + float temperature, + bool hard, + int axis, + MetaTensor* out); + void CastInferMeta(const MetaTensor& x, DataType out_dtype, MetaTensor* out); void CholeskyInferMeta(const MetaTensor& x, bool upper, MetaTensor* out); diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 1523401d1913a..ef51d6daf6a00 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -10,7 +10,7 @@ add_subdirectory(funcs) set_property(GLOBAL PROPERTY PHI_KERNELS "") set(COMMON_KERNEL_DEPS dense_tensor sparse_coo_tensor sparse_csr_tensor kernel_context kernel_factory arg_map_context convert_utils lod_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} eigen_function blas math_function im2col concat_and_split_functor softmax) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta) diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc new file mode 100644 index 0000000000000..a4c131e72b59a --- /dev/null +++ b/paddle/phi/kernels/cpu/gumbel_softmax_grad_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(gumbel_softmax_grad, + CPU, + ALL_LAYOUT, + phi::GumbelSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc new file mode 100644 index 0000000000000..eb406665c5f4f --- /dev/null +++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gumbel_softmax_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +template +struct GumbleNoiseGenerator { + static void Transform(const CPUContext& ctx, + const T* input_data, + T* output_data, + int size_to_axis, + int size_from_axis, + const float temperature) { + // generate uniform random number + const int size = size_to_axis * size_from_axis; + std::uniform_real_distribution dist(0.00001, 1); + auto engine = ctx.GetGenerator()->GetCPUEngine(); + DenseTensor random_tensor; + random_tensor.Resize(make_ddim({size})); + auto* random_data = ctx.template Alloc(&random_tensor); + for (int64_t i = 0; i < size; ++i) { + random_data[i] = dist(*engine); + } + + // generate gumbel noise + DDim dim_2d{size_to_axis, size_from_axis}; + auto gumbel_noise_eigen = EigenMatrix::From(random_tensor, dim_2d); + gumbel_noise_eigen = -(((-(gumbel_noise_eigen.log())).log())); + + // add noise + for (int64_t i = 0; i < size_to_axis * size_from_axis; i++) { + output_data[i] = (input_data[i] + random_data[i]) / temperature; + } + } +}; + +template +struct OneHotGenerator { + static void Transform(const CPUContext& ctx, + const DenseTensor& x, + DenseTensor* out, + int axis) { + DenseTensor index; + std::vector index_dim; + const auto rank = x.dims().size(); + const int size_to_axis = funcs::SizeToAxis(axis, x.dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, x.dims()); + const int size_out_axis = funcs::SizeOutAxis(axis, x.dims()); + + for (int i = 0; i < x.dims().size(); i++) { + if (i != axis) index_dim.push_back(x.dims().Get()[i]); + } + DDim index_ddim(index_dim.data(), rank - 1); + index.Resize(index_ddim); + auto* index_data = ctx.template Alloc(&index); + +#define CALL_ARG_MINMAX_FUNCTOR(rank) \ + ArgMaxFunctor functor##rank; \ + functor##rank(ctx, *out, &index, axis); + switch (out->dims().size()) { + case 1: + CALL_ARG_MINMAX_FUNCTOR(1); + break; + case 2: + CALL_ARG_MINMAX_FUNCTOR(2); + break; + case 3: + CALL_ARG_MINMAX_FUNCTOR(3); + break; + case 4: + CALL_ARG_MINMAX_FUNCTOR(4); + break; + case 5: + CALL_ARG_MINMAX_FUNCTOR(5); + break; + case 6: + CALL_ARG_MINMAX_FUNCTOR(6); + break; + default: + 
PADDLE_ENFORCE_LE( + out->dims().size(), + 6, + errors::InvalidArgument("gumbel_softmax operator doesn't supports " + "tensors whose ranks are greater " + "than 6 in CPU mode.")); + break; +#undef CALL_ARG_MINMAX_FUNCTOR + } + + funcs::set_constant(ctx, out, 0.0); + for (int i = 0; i < size_to_axis; i++) { + for (int j = 0; j < size_out_axis; j++) { + *(out->data() + i * size_from_axis + j + + index_data[i * size_out_axis + j] * size_out_axis) = 1.0; + } + } + } +}; + +} // namespace phi + +PD_REGISTER_KERNEL( + gumbel_softmax, CPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu new file mode 100644 index 0000000000000..a28a7512f4986 --- /dev/null +++ b/paddle/phi/kernels/gpu/gumbel_softmax_grad_kernel.cu @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/gumbel_softmax_grad_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" + +PD_REGISTER_KERNEL(gumbel_softmax_grad, + GPU, + ALL_LAYOUT, + phi::GumbelSoftmaxGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu new file mode 100644 index 0000000000000..6b1e58981baa0 --- /dev/null +++ b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -0,0 +1,181 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/gumbel_softmax_kernel.h" +#include "paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include +#include +#include +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +template +using KeyValuePair = cub::KeyValuePair; + +template +struct UniformCUDAGenerator { + T min_, max_; + unsigned int seed_; + unsigned int offset_ = 0; + HOSTDEVICE UniformCUDAGenerator(T min, T max, unsigned int seed) + : min_(min), max_(max), seed_(seed) {} + HOSTDEVICE UniformCUDAGenerator(T min, + T max, + unsigned int seed, + unsigned int offset) + : min_(min), max_(max), seed_(seed), offset_(offset) {} + + HOSTDEVICE T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + return dist(rng); + } +}; + +template +__global__ void OneHotCUDAKernel(const int64_t height, + const int64_t width, + const int64_t size_out_axis, + const T init, + const T* in, + T* out) { + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + for (int64_t idx = blockIdx.x; idx < height; idx += gridDim.x) { + KeyValuePair kv_pair = {-1, init}; + int h = idx / size_out_axis; + int w = idx % size_out_axis; + cub::ArgMax reducer; + for (int k = threadIdx.x; k < width; k += blockDim.x) { + kv_pair = reducer( + {k, in[h * width * size_out_axis + k * size_out_axis + w]}, kv_pair); + } + kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer); + if (threadIdx.x == 0) { + int index = static_cast(kv_pair.key); + out[h * width * size_out_axis + index * size_out_axis + w] = 1; + } + __syncthreads(); + } +} + +template +struct OneHotGenerator { + static void Transform(const GPUContext& ctx, + const DenseTensor& X, + DenseTensor* out, + int axis) { + const int size_to_axis = funcs::SizeToAxis(axis, X.dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, X.dims()); + const int size_out_axis = funcs::SizeOutAxis(axis, X.dims()); + constexpr int thread_size = 512; + int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize()[0]; + int64_t height = size_to_axis * size_out_axis; + int block_size = height < max_grid_dimx ? 
height : max_grid_dimx; + + DenseTensor input_tensor; + input_tensor.Resize(out->dims()); + ctx.template Alloc(&input_tensor); + paddle::framework::TensorCopy(*out, ctx.GetPlace(), &input_tensor); + funcs::set_constant(ctx, out, 0.0); + OneHotCUDAKernel<<>>( + height, + size_from_axis / size_out_axis, + size_out_axis, + std::numeric_limits::lowest(), + input_tensor.data(), + out->data()); + } +}; + +template +__global__ void AddGumbelNoiseCUDAKernel(const T* input_data, + T* output_data, + T* noise, + const float temperature, + int64_t n) { + int index = threadIdx.x + blockIdx.x * blockDim.x; + int step = blockDim.x * gridDim.x; + for (int64_t i = index; i < n; i += step) { + T gumbel_noise = -log(-log(noise[i])); + output_data[i] = (gumbel_noise + input_data[i]) / temperature; + } +} + +template +struct GumbleNoiseGenerator { + static void Transform(const GPUContext& ctx, + const T* input_data, + T* output_data, + int size_to_axis, + int size_from_axis, + const float temperature) { + DenseTensor random_tensor; + int64_t size = size_to_axis * size_from_axis; + random_tensor.Resize(make_ddim({size})); + auto* random_data = ctx.template Alloc(&random_tensor); + thrust::counting_iterator index_sequence_begin(0); + + // generate gumbel noise + int device_id = ctx.GetPlace().GetDeviceId(); + auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy()) { + auto seed_offset = gen_cuda->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + thrust::transform( + index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(random_data), + UniformCUDAGenerator(0.00001, 1, seed_offset.first, gen_offset)); + } else { + const unsigned int seed = std::random_device()(); + thrust::transform(index_sequence_begin, + index_sequence_begin + size, + thrust::device_ptr(random_data), + UniformCUDAGenerator(0.00001, 1, seed)); + } + + // add gumbel noise to X + const int thread_size = 512; + int64_t block_size = (size + thread_size) / thread_size; + AddGumbelNoiseCUDAKernel<<>>( + input_data, output_data, random_data, temperature, size); + } +}; + +} // namespace phi +#endif + +PD_REGISTER_KERNEL( + gumbel_softmax, GPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {} diff --git a/paddle/phi/kernels/gumbel_softmax_grad_kernel.h b/paddle/phi/kernels/gumbel_softmax_grad_kernel.h new file mode 100644 index 0000000000000..e3f02d90fcb6a --- /dev/null +++ b/paddle/phi/kernels/gumbel_softmax_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GumbelSoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx); + +} // namespace phi diff --git a/paddle/phi/kernels/gumbel_softmax_kernel.h b/paddle/phi/kernels/gumbel_softmax_kernel.h new file mode 100644 index 0000000000000..46edb9750dd34 --- /dev/null +++ b/paddle/phi/kernels/gumbel_softmax_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" +namespace phi { + +template +void GumbelSoftmaxKernel(const Context& dev_ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h b/paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h new file mode 100644 index 0000000000000..3d57dd1002ac8 --- /dev/null +++ b/paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" + +namespace phi { + +template +void GumbelSoftmaxGradKernel(const Context& ctx, + const DenseTensor& out, + const DenseTensor& dout, + int axis, + DenseTensor* dx) { + const int rank = dx->dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + int axis_dim = dx->dims()[axis]; + // allocate memory on device. 
+ + ctx.template Alloc(dx); + if (dx->numel() == 0) { + return; + } + + const int size_to_axis = funcs::SizeToAxis(axis, dx->dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, dx->dims()); + DenseTensor dx_2d(*dx), out_2d(out), dout_2d(dout); + dx_2d.Resize({size_to_axis, size_from_axis}); + out_2d.Resize({size_to_axis, size_from_axis}); + dout_2d.Resize({size_to_axis, size_from_axis}); + paddle::operators::math::SoftmaxGradFunctor()( + ctx, axis_dim, &out_2d, &dout_2d, &dx_2d); +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h new file mode 100644 index 0000000000000..2517d84898727 --- /dev/null +++ b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h @@ -0,0 +1,96 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +namespace phi { + +template +struct ArgMaxFunctor { + void operator()(const Context& ctx, + const DenseTensor& in, + DenseTensor* index_tensor, + const int64_t& axis) { + auto in_eigen = EigenTensor::From(in, in.dims()); + auto index_eigen = EigenTensor::From(*index_tensor); + index_eigen = in_eigen.argmax(axis).template cast(); + } +}; + +template +struct GumbleNoiseGenerator; + +template +struct OneHotGenerator; + +template +void GumbelSoftmaxKernel(const Context& ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out) { + const int rank = x.dims().size(); + axis = funcs::CanonicalAxis(axis, rank); + int axis_dim = x.dims()[axis]; + + PADDLE_ENFORCE_GT(temperature, + 0, + phi::errors::InvalidArgument( + "The temperature must be greater than 0. But " + "received temperature = %f", + temperature)); + + // allocate memory on device. 
+ ctx.template Alloc(out); + if (out->numel() == 0) { + return; + } + + const int size_to_axis = funcs::SizeToAxis(axis, x.dims()); + const int size_from_axis = funcs::SizeFromAxis(axis, x.dims()); + DenseTensor x_noise_2d, out_2d(*out); + x_noise_2d.Resize({size_to_axis, size_from_axis}); + out_2d.Resize({size_to_axis, size_from_axis}); + + // generate gumbel noise and add it to X + auto* x_noise_data = ctx.template Alloc(&x_noise_2d); + GumbleNoiseGenerator::Transform(ctx, + x.data(), + x_noise_data, + size_to_axis, + size_from_axis, + temperature); + +#ifdef PADDLE_ON_INFERENCE + paddle::operators::math::SoftmaxFunctor()( + ctx, axis_dim, &x_noise_2d, &out_2d); +#else + paddle::operators::math::SoftmaxFunctor()( + ctx, axis_dim, &x_noise_2d, &out_2d); +#endif + + if (hard) { + OneHotGenerator::Transform(ctx, x, out, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/gumbel_softmax_sig.cc b/paddle/phi/ops/compat/gumbel_softmax_sig.cc new file mode 100644 index 0000000000000..c7585a4e5f39a --- /dev/null +++ b/paddle/phi/ops/compat/gumbel_softmax_sig.cc @@ -0,0 +1,30 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature GumbelSoftmaxGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("gumbel_softmax_grad", + {"Out", GradVarName("Out")}, + {"axis"}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax_grad, + phi::GumbelSoftmaxGradOpArgumentMapping); From 9a7b9eda579ec673204af5ba350d0d5b5104c28c Mon Sep 17 00:00:00 2001 From: zyfncg Date: Sat, 26 Feb 2022 20:09:38 +0800 Subject: [PATCH 70/85] [Pten] Refactor the copy kernel (#39731) * remove SetAllocationForOutputTenosr * add place param for copy kernel * recover SetAllocationForOutputTenosr * polish code * fix empty_dev api bug * test=allcases * test=allcases * fix bug * recover empty * recover modify --- paddle/phi/api/lib/api_custom_impl.cc | 40 +++++----- paddle/phi/core/kernel_utils.h | 1 + paddle/phi/kernels/copy_kernel.h | 1 + paddle/phi/kernels/cpu/copy_kernel.cc | 1 + paddle/phi/kernels/flatten_grad_kernel.cc | 4 +- paddle/phi/kernels/flatten_kernel.cc | 4 +- paddle/phi/kernels/gpu/copy_kernel.cu | 75 +++---------------- paddle/phi/kernels/gpu/elementwise.h | 10 +-- paddle/phi/kernels/gpu/full_kernel.cu | 4 +- .../impl/elementwise_grad_kernel_impl.h | 4 +- .../kernels/impl/expand_grad_kernel_impl.h | 2 +- paddle/phi/kernels/impl/size_kernel_impl.h | 2 +- paddle/phi/kernels/reshape_grad_kernel.cc | 2 +- paddle/phi/kernels/reshape_kernel.cc | 2 +- paddle/phi/kernels/xpu/copy_kernel.cc | 13 +++- paddle/phi/tests/api/test_matmul_api.cc | 6 +- paddle/phi/tests/kernels/test_copy_dev_api.cc | 3 +- .../tests/kernels/test_creation_dev_api.cc | 2 +- .../kernels/test_sparse_utils_dev_api.cc | 41 +++++----- 19 files changed, 86 insertions(+), 131 deletions(-) diff --git a/paddle/phi/api/lib/api_custom_impl.cc 
b/paddle/phi/api/lib/api_custom_impl.cc index 66dba2cc2e1b0..67b743016707a 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -32,39 +32,33 @@ namespace paddle { namespace experimental { Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { - // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "copy", kernel_key); - VLOG(0) << "to API kernel key: " << kernel_key; - VLOG(0) << "to API kernel: " << kernel; + VLOG(6) << "copy API kernel key: " << kernel_key; + VLOG(6) << "copy API kernel: " << kernel; - // 2. Get Device Context auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = phi::KernelContext(dev_ctx); - - // 3. Auto data transform - auto dense_x = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(dense_x.get()); - kernel_context.EmplaceBackAttr(blocking); - - // 4. Prepare outputs & InferMeta - auto dense_out = std::make_shared( - phi::make_intrusive( - phi::TransToPtenPlace(backend)), - phi::DenseTensorMeta()); - phi::MetaTensor meta_out(dense_out.get()); - phi::UnchangedInferMeta(*dense_x, &meta_out); - dense_out->mutable_data(phi::TransToPtenPlace(backend)); - kernel_context.EmplaceBackOutput(dense_out.get()); + + auto dense_x = TensorToDenseTensor(x); + Tensor out; - out.set_impl(dense_out); + auto kernel_out = SetKernelOutput(kernel_key.backend(), &out); + phi::MetaTensor meta_out(kernel_out); + phi::UnchangedInferMeta(*dense_x, &meta_out); - // 5. Call kernel - kernel(&kernel_context); + using kernel_signature = void (*)(const platform::DeviceContext&, + const phi::DenseTensor&, + phi::Place, + bool, + phi::DenseTensor*); + + auto* kernel_fn = kernel.GetVariadicKernelFn(); + (*kernel_fn)( + *dev_ctx, *dense_x, phi::TransToPtenPlace(backend), blocking, kernel_out); return out; } diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 862f61b20400e..2fda3cb6db4fd 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -245,6 +245,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout); + PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&); PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector&); diff --git a/paddle/phi/kernels/copy_kernel.h b/paddle/phi/kernels/copy_kernel.h index a3ba6eabcdd69..95df29f7e653a 100644 --- a/paddle/phi/kernels/copy_kernel.h +++ b/paddle/phi/kernels/copy_kernel.h @@ -22,6 +22,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst); } // namespace phi diff --git a/paddle/phi/kernels/cpu/copy_kernel.cc b/paddle/phi/kernels/cpu/copy_kernel.cc index 7dcd75d39e4df..1af071f23ddc5 100644 --- a/paddle/phi/kernels/cpu/copy_kernel.cc +++ b/paddle/phi/kernels/cpu/copy_kernel.cc @@ -28,6 +28,7 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); diff --git 
a/paddle/phi/kernels/flatten_grad_kernel.cc b/paddle/phi/kernels/flatten_grad_kernel.cc index 7e8010a43f3d1..f6ba2725004fe 100644 --- a/paddle/phi/kernels/flatten_grad_kernel.cc +++ b/paddle/phi/kernels/flatten_grad_kernel.cc @@ -26,8 +26,8 @@ void FlattenGradKernel(const Context& dev_ctx, DenseTensor* x_grad) { auto xshape_dims = xshape.dims(); auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size()); - phi::Copy(dev_ctx, out_grad, false, x_grad); - x_grad->ResizeAndAllocate(x_dims); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); + x_grad->Resize(x_dims); } } // namespace phi diff --git a/paddle/phi/kernels/flatten_kernel.cc b/paddle/phi/kernels/flatten_kernel.cc index 12eaab92d5211..78ac9eaa785cd 100644 --- a/paddle/phi/kernels/flatten_kernel.cc +++ b/paddle/phi/kernels/flatten_kernel.cc @@ -28,8 +28,8 @@ void FlattenKernel(const Context& dev_ctx, int stop_axis, DenseTensor* out) { auto out_dims = out->dims(); - phi::Copy(dev_ctx, x, false, out); - out->ResizeAndAllocate(out_dims); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); + out->Resize(out_dims); } // TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate diff --git a/paddle/phi/kernels/gpu/copy_kernel.cu b/paddle/phi/kernels/gpu/copy_kernel.cu index 0cbf5525d60f5..4545f9ce436ea 100644 --- a/paddle/phi/kernels/gpu/copy_kernel.cu +++ b/paddle/phi/kernels/gpu/copy_kernel.cu @@ -28,11 +28,11 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* src_ptr = src.data(); const auto& src_place = src.place(); - auto dst_place = dst->place(); if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) { PADDLE_THROW(phi::errors::InvalidArgument( @@ -43,8 +43,14 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->ResizeAndAllocate(src.dims()); - auto* dst_ptr = dst->mutable_data(dst_place); + dst->Resize(src.dims()); + + void* dst_ptr = nullptr; + if (paddle::platform::is_cpu_place(dst_place)) { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " @@ -57,17 +63,8 @@ void Copy(const Context& dev_ctx, auto size = src.numel() * paddle::experimental::SizeOf(src.dtype()); - if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_cpu_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_cpu_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); - } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT - paddle::platform::is_cpu_place(dst_place)) { + if (paddle::platform::is_gpu_place(src_place) && // NOLINT + paddle::platform::is_cpu_place(dst_place)) { auto src_gpu_place = src_place; auto dst_cpu_place = dst_place; auto ctx_place = dev_ctx.GetPlace(); @@ -114,56 +111,6 @@ void Copy(const Context& dev_ctx, : reinterpret_cast(dev_ctx).stream(); paddle::memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, 
stream); - } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT - paddle::platform::is_cuda_pinned_place(dst_place)) { - auto src_gpu_place = src_place; - auto dst_cuda_pinned_place = dst_place; - auto ctx_place = dev_ctx.GetPlace(); - PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), - true, - phi::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from GPU memory to CUDA Pinned memory, current " - "device context place should be GPU.")); - auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(src_gpu_place, - ctx_gpu_place, - phi::errors::PreconditionNotMet( - "The source GPU device and current device context do " - "not match. The source GPU device number is %d, but " - "device context GPU number is %d.", - src_gpu_place.device, - ctx_gpu_place.device)); - auto stream = - blocking ? nullptr - : reinterpret_cast(dev_ctx).stream(); - paddle::memory::Copy( - dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream); - } else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT - paddle::platform::is_gpu_place(dst_place)) { - auto src_cuda_pinned_place = src_place; - auto dst_gpu_place = dst_place; - auto ctx_place = dev_ctx.GetPlace(); - PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place), - true, - phi::errors::PreconditionNotMet( - "Device context place mismatch. When copying Tensor " - "data from CUDA Pinned memory to GPU memory, current " - "device context place should be GPU.")); - auto ctx_gpu_place = ctx_place; - PADDLE_ENFORCE_EQ(dst_gpu_place, - ctx_gpu_place, - phi::errors::PreconditionNotMet( - "The target GPU device and current device context do " - "not match. The target GPU device number is %d, but " - "device context GPU number is %d.", - dst_gpu_place.device, - ctx_gpu_place.device)); - auto stream = - blocking ? 
nullptr - : reinterpret_cast(dev_ctx).stream(); - paddle::memory::Copy( - dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream); } else if (paddle::platform::is_gpu_place(src_place) && // NOLINT paddle::platform::is_gpu_place(dst_place)) { auto src_gpu_place = src_place; diff --git a/paddle/phi/kernels/gpu/elementwise.h b/paddle/phi/kernels/gpu/elementwise.h index 369bd8d8ad418..12cafc7023bb5 100644 --- a/paddle/phi/kernels/gpu/elementwise.h +++ b/paddle/phi/kernels/gpu/elementwise.h @@ -1460,7 +1460,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, auto *dx_data = dx->mutable_data(ctx.GetPlace()); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } } else { // For inplace strategy, dx will be stored in addr of dout, which makes @@ -1481,7 +1481,7 @@ void default_elementwise_add_grad(const GPUContext &ctx, auto *dy_data = dy->mutable_data(ctx.GetPlace()); if (dy->dims() == dout.dims()) { if (dy_data != dout_data) { - phi::Copy(ctx, dout, false, dy); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); } } else { std::vector reduce_dims = @@ -1507,11 +1507,11 @@ void elementwise_add_grad(const GPUContext &ctx, if (dx_data == dout_data && dy_data != dout_data) { VLOG(4) << "Special case when dx_data is the same as dout_data, " "only need copy dout to dy"; - phi::Copy(ctx, dout, false, dy); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dy); } else if (dx_data != dout_data && dy_data == dout_data) { VLOG(4) << "Special case when dy_data is the same as dout_data, " "only need copy dout to dx"; - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } else if (dx_data != dout_data && dy_data != dout_data) { auto size = x.numel(); int vec_size = max(static_cast(sizeof(float4) / sizeof(T)), 1); @@ -1571,7 +1571,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx, auto *dx_data = dx->mutable_data(ctx.GetPlace()); if (dx->dims() == dout.dims()) { if (dx_data != dout_data) { - phi::Copy(ctx, dout, false, dx); + phi::Copy(ctx, dout, ctx.GetPlace(), false, dx); } } else { // For inplace strategy, dx will be stored in addr of dout, which makes diff --git a/paddle/phi/kernels/gpu/full_kernel.cu b/paddle/phi/kernels/gpu/full_kernel.cu index 48b26540331ef..1f756bfdbed30 100644 --- a/paddle/phi/kernels/gpu/full_kernel.cu +++ b/paddle/phi/kernels/gpu/full_kernel.cu @@ -41,7 +41,7 @@ void FullKernel(const Context& dev_ctx, DenseTensor* out) { out->Resize(phi::make_ddim(shape.GetData())); int numel = out->numel(); - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); if (numel > 0) { // in transformer model the numel of outpout will be zero. std::vector inputs = {}; @@ -85,7 +85,7 @@ void FullLikeKernel(const Context& dev_ctx, static_cast(value))); std::vector inputs = {}; std::vector outputs = {out}; - out->mutable_data(dev_ctx.GetPlace()); + dev_ctx.template Alloc(out); // This function has no input, so the inputs.size() == 0. 
Use kUnary, but the // data will not be loaded in the kernel because the number of parameters in // the operator is 0 diff --git a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h index cafcb302d65b9..460e74b58166a 100644 --- a/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/elementwise_grad_kernel_impl.h @@ -36,12 +36,12 @@ void AddGradImpl(const Context& dev_ctx, x_grad->dims() == out_grad.dims()) { VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't " "reduce"; - phi::Copy(dev_ctx, out_grad, false, x_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); } else if (x_grad == nullptr && y_grad != nullptr && y_grad->dims() == out_grad.dims()) { VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't " "reduce"; - phi::Copy(dev_ctx, out_grad, false, y_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, y_grad); } else { grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis); } diff --git a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h index 889b560dd7398..766f91cd22e1f 100644 --- a/paddle/phi/kernels/impl/expand_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/expand_grad_kernel_impl.h @@ -88,7 +88,7 @@ void ExpandGradKernel(const Context& ctx, } // no need reduce, just copy if (just_copy) { - phi::Copy(ctx, out_grad, false, in_grad); + phi::Copy(ctx, out_grad, ctx.GetPlace(), false, in_grad); } else { PADDLE_ENFORCE_GE(dims, 1, diff --git a/paddle/phi/kernels/impl/size_kernel_impl.h b/paddle/phi/kernels/impl/size_kernel_impl.h index 9a873871d75fd..7b781dba3ad23 100644 --- a/paddle/phi/kernels/impl/size_kernel_impl.h +++ b/paddle/phi/kernels/impl/size_kernel_impl.h @@ -32,7 +32,7 @@ void SizeKernel(const Context& ctx, cpu_tensor.Resize(out->dims()); auto cpu_data = ctx.template HostAlloc(&cpu_tensor); cpu_data[0] = input.numel(); - phi::Copy(ctx, cpu_tensor, false, out); + phi::Copy(ctx, cpu_tensor, place, false, out); } } diff --git a/paddle/phi/kernels/reshape_grad_kernel.cc b/paddle/phi/kernels/reshape_grad_kernel.cc index 5361315bb611b..38132966407dc 100644 --- a/paddle/phi/kernels/reshape_grad_kernel.cc +++ b/paddle/phi/kernels/reshape_grad_kernel.cc @@ -24,7 +24,7 @@ void ReshapeGradKernel(const Context& dev_ctx, const DenseTensor& out_grad, DenseTensor* x_grad) { auto x_dims = x_grad->dims(); - phi::Copy(dev_ctx, out_grad, false, x_grad); + phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad); x_grad->Resize(x_dims); } diff --git a/paddle/phi/kernels/reshape_kernel.cc b/paddle/phi/kernels/reshape_kernel.cc index 570e70ea11227..f758d7c70518f 100644 --- a/paddle/phi/kernels/reshape_kernel.cc +++ b/paddle/phi/kernels/reshape_kernel.cc @@ -36,7 +36,7 @@ void ReshapeKernel(const Context& dev_ctx, // TODO(chenweihang): the output dims are overwrite after copying, // here we need to use copy method that only copy data auto dims = out->dims(); - phi::Copy(dev_ctx, x, false, out); + phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out); out->Resize(dims); out->ResetLoD(x.lod()); } diff --git a/paddle/phi/kernels/xpu/copy_kernel.cc b/paddle/phi/kernels/xpu/copy_kernel.cc index 58efbafc88bee..fb931ef18a856 100644 --- a/paddle/phi/kernels/xpu/copy_kernel.cc +++ b/paddle/phi/kernels/xpu/copy_kernel.cc @@ -27,12 +27,19 @@ namespace phi { template void Copy(const Context& dev_ctx, const DenseTensor& src, + Place dst_place, bool blocking, DenseTensor* dst) { auto* 
src_ptr = src.data(); - auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + void* dst_ptr = nullptr; + + dst->Resize(src.dims()); + if (paddle::platform::is_cpu_place(dst_place)) { + dst_ptr = dev_ctx.HostAlloc(dst, src.dtype()); + } else { + dst_ptr = dev_ctx.Alloc(dst, src.dtype()); + } const auto& src_place = src.place(); - const auto& dst_place = dst->place(); if (src_ptr == dst_ptr && src_place == dst_place) { VLOG(3) << "Skip copy the same data async from " << src_place << " to " @@ -43,7 +50,7 @@ void Copy(const Context& dev_ctx, VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to " << dst_place; - dst->ResizeAndAllocate(src.dims()); + CHECK(dst->layout() == src.layout()); auto size = src.numel() * paddle::experimental::SizeOf(src.dtype()); diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc index fd8a127b7c77d..e5fc9c5b1f64b 100644 --- a/paddle/phi/tests/api/test_matmul_api.cc +++ b/paddle/phi/tests/api/test_matmul_api.cc @@ -127,8 +127,8 @@ TEST(API, matmul_cuda) { auto place = paddle::platform::CUDAPlace(); auto* dev_ctx = static_cast(pool.GetByPlace(place)); - phi::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get()); - phi::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get()); + phi::Copy(*dev_ctx, *ref_x.get(), phi::GPUPlace(), false, dense_x.get()); + phi::Copy(*dev_ctx, *ref_y.get(), phi::GPUPlace(), false, dense_y.get()); paddle::experimental::Tensor x(dense_x); paddle::experimental::Tensor y(dense_y); @@ -152,7 +152,7 @@ TEST(API, matmul_cuda) { phi::DenseTensorMeta( phi::DataType::FLOAT32, out.dims(), phi::DataLayout::NCHW)); - phi::Copy(*dev_ctx, *dense_out.get(), false, ref_out.get()); + phi::Copy(*dev_ctx, *dense_out.get(), phi::CPUPlace(), false, ref_out.get()); for (size_t i = 0; i < 9; i++) { ASSERT_NEAR(sum[i], ref_out->data()[i], 1e-6f); diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index 4cd283d925ab4..d69c7b2174f72 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -62,7 +62,8 @@ TEST(DEV_API, copy) { .GetAllocator(paddle::platform::CPUPlace()) .get()); dev_ctx.Init(); - phi::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get()); + phi::Copy( + dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); // 3. check result for (int64_t i = 0; i < dense_src->numel(); i++) { diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc index 8e825b7790111..e4f80a5bd19eb 100644 --- a/paddle/phi/tests/kernels/test_creation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc @@ -39,7 +39,7 @@ TEST(DEV_API, empty) { dev_ctx.Init(); // 2. test API - auto out = phi::Empty(dev_ctx, {3, 2}, phi::DataType::INT32); + auto out = phi::Empty(dev_ctx, {3, 2}, phi::DataType::INT32); // 3. 
check result ASSERT_EQ(out.dims().size(), 2); diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc index a75ca633b05a8..15c00d385eda9 100644 --- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc @@ -53,8 +53,8 @@ inline void CheckResult( DenseTensorMeta(real_elements.dtype(), real_elements.dims(), real_elements.layout())); - phi::Copy(*dev_ctx_gpu, real_indices, true, &indices); - phi::Copy(*dev_ctx_gpu, real_elements, true, &elements); + phi::Copy(*dev_ctx_gpu, real_indices, indices.place(), true, &indices); + phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements); int cmp_indices = memcmp(indices.data(), non_zero_indices.data(), @@ -122,7 +122,7 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, cuda_alloc.get(), DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout())); - phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); + phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x); auto sparse_out = sparse::DenseToSparseCoo(dev_ctx_gpu, d_dense_x, sparse_dim); CheckResult(&dev_ctx_gpu, @@ -327,9 +327,9 @@ void TestSparseCsrToCoo(const DDim& dense_dims, phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta); phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, crows, true, &d_crows); - phi::Copy(dev_ctx_gpu, cols, true, &d_cols); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, crows, d_crows.place(), true, &d_crows); + phi::Copy(dev_ctx_gpu, cols, d_cols.place(), true, &d_cols); + phi::Copy(dev_ctx_gpu, values, d_values.place(), true, &d_values); phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCsrToCoo(dev_ctx_gpu, d_csr); CheckResult(&dev_ctx_gpu, @@ -406,9 +406,9 @@ inline void CheckCsrResult( DenseTensorMeta(real_elements.dtype(), real_elements.dims(), real_elements.layout())); - phi::Copy(*dev_ctx_gpu, real_crows, true, &crows); - phi::Copy(*dev_ctx_gpu, real_cols, true, &cols); - phi::Copy(*dev_ctx_gpu, real_elements, true, &elements); + phi::Copy(*dev_ctx_gpu, real_crows, crows.place(), true, &crows); + phi::Copy(*dev_ctx_gpu, real_cols, cols.place(), true, &cols); + phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements); int cmp_crows = memcmp(crows.data(), non_zero_crows.data(), @@ -500,8 +500,8 @@ void TestCooToCsr(const DDim& dense_dims, dev_ctx_gpu.PartialInitWithAllocator(); phi::DenseTensor d_indices(cuda_alloc.get(), indices_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, indices, true, &d_indices); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, indices, phi::GPUPlace(), true, &d_indices); + phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values); phi::SparseCooTensor d_coo(d_indices, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCooToCsr(dev_ctx_gpu, d_coo); CheckCsrResult(&dev_ctx_gpu, @@ -593,7 +593,7 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x, .GetAllocator(phi::CPUPlace()) .get()); dev_ctx_gpu.PartialInitWithAllocator(); - phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x); + phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x); auto sparse_out = sparse::DenseToSparseCsr(dev_ctx_gpu, d_dense_x); CheckCsrResult(&dev_ctx_gpu, @@ -720,8 +720,10 @@ void TestSparseCooToDense(const DDim& 
dense_dims, dev_ctx_gpu.PartialInitWithAllocator(); DenseTensor d_dense_indices(cuda_alloc.get(), dense_indices.meta()); DenseTensor d_dense_elements(cuda_alloc.get(), dense_elements.meta()); - phi::Copy(dev_ctx_gpu, dense_indices, true, &d_dense_indices); - phi::Copy(dev_ctx_gpu, dense_elements, true, &d_dense_elements); + phi::Copy( + dev_ctx_gpu, dense_indices, phi::GPUPlace(), true, &d_dense_indices); + phi::Copy( + dev_ctx_gpu, dense_elements, phi::GPUPlace(), true, &d_dense_elements); SparseCooTensor coo_cuda(d_dense_indices, d_dense_elements, dense_dims); auto dense_out_cuda = sparse::SparseCooToDense(dev_ctx_gpu, coo_cuda); @@ -729,7 +731,8 @@ void TestSparseCooToDense(const DDim& dense_dims, DenseTensorMeta(dense_out_cuda.dtype(), dense_out_cuda.dims(), dense_out_cuda.layout())); - phi::Copy(dev_ctx_gpu, dense_out_cuda, true, &h_dense_out); + phi::Copy( + dev_ctx_gpu, dense_out_cuda, h_dense_out.place(), true, &h_dense_out); int cmp_cuda = memcmp( &dense_data[0], h_dense_out.data(), sizeof(T) * dense_data.size()); ASSERT_EQ(cmp_cuda, 0); @@ -858,13 +861,13 @@ void TestSparseCsrToDense(const DDim& dense_dims, phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta); phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta); phi::DenseTensor d_values(cuda_alloc.get(), values_meta); - phi::Copy(dev_ctx_gpu, crows, true, &d_crows); - phi::Copy(dev_ctx_gpu, cols, true, &d_cols); - phi::Copy(dev_ctx_gpu, values, true, &d_values); + phi::Copy(dev_ctx_gpu, crows, phi::GPUPlace(), true, &d_crows); + phi::Copy(dev_ctx_gpu, cols, phi::GPUPlace(), true, &d_cols); + phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values); phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims); auto cuda_sparse_out = sparse::SparseCsrToDense(dev_ctx_gpu, d_csr); phi::DenseTensor h_out(alloc.get(), cpu_sparse_out.meta()); - phi::Copy(dev_ctx_gpu, cuda_sparse_out, true, &h_out); + phi::Copy(dev_ctx_gpu, cuda_sparse_out, phi::CPUPlace(), true, &h_out); int cmp_cuda = memcmp(h_out.data(), dense_data.data(), sizeof(T) * dense_data.size()); ASSERT_EQ(cmp_cuda, 0); From b33a3c232c6161b2e1f7e4c4c78012e2715c9047 Mon Sep 17 00:00:00 2001 From: YuanRisheng Date: Sat, 26 Feb 2022 22:38:29 +0800 Subject: [PATCH 71/85] revert reshape op infershape (#39946) --- paddle/fluid/operators/reshape_op.cc | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ddb598f575f67..0e74a23523b7d 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -476,6 +476,21 @@ class Reshape2Op : public ReshapeOp { const framework::VariableNameMap &outputs, const framework::AttributeMap &attrs) : ReshapeOp(type, inputs, outputs, attrs) {} + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasOutput("XShape"), true, + platform::errors::InvalidArgument( + "Output(XShape) of ReshapeOp should not be null.")); + const auto &x_dims = ctx->GetInputDim("X"); + std::vector xshape_dims(x_dims.size() + 1); + xshape_dims[0] = 0; + for (int i = 0; i < x_dims.size(); ++i) { + xshape_dims[i + 1] = x_dims[i]; + } + ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims)); + ctx->ShareLoD("X", /*->*/ "XShape"); + + ReshapeOp::InferShape(ctx); + } }; class Reshape2OpMaker : public ReshapeOpMaker { @@ -636,13 +651,10 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, ops::ReshapeGradKernel, int64_t, ops::ReshapeGradKernel); 
-DELCARE_INFER_SHAPE_FUNCTOR(reshape2, ReshapeInferShapeFunctor, - PT_INFER_META(phi::ReshapeWithXShapeInferMeta)); - REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker, ops::Reshape2GradMaker, - ReshapeInferShapeFunctor, ops::ReshapeOpInplaceInferer); + ops::ReshapeOpInplaceInferer); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, ops::Reshape2DoubleGradMaker, ops::Reshape2DoubleGradMaker, From 282e09dcfd604f356fd3c8a63eae7d66c58dc015 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sun, 27 Feb 2022 11:59:27 +0800 Subject: [PATCH 72/85] fix pylayer problem with amp (#39950) * fix pylayer problem with amp * add ut * refine code --- python/paddle/autograd/py_layer.py | 10 +++++++ python/paddle/fluid/dygraph/amp/auto_cast.py | 13 +++++++++ .../test_imperative_auto_mixed_precision.py | 27 +++++++++++++++++++ 3 files changed, 50 insertions(+) diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index 5a22d22151a1c..26740dfd0f6db 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -14,6 +14,8 @@ import paddle from paddle.fluid.framework import dygraph_only +from paddle.fluid.dygraph.amp.auto_cast import amp_state +from paddle.amp.auto_cast import auto_cast from paddle.fluid import core __all__ = [] @@ -46,6 +48,7 @@ def backward(ctx, dy): def __init__(self): self.container = None + self._amp_state = amp_state() def save_for_backward(self, *tensors): """ @@ -178,6 +181,13 @@ class PyLayerBackward(PyLayerContext): def backward(self, *args, **kwargs): with paddle.fluid.dygraph.guard(): with paddle.fluid.dygraph.no_grad(): + if self._amp_state and 'enable' in self._amp_state and self._amp_state[ + 'enable']: + with auto_cast(**args[0]._amp_state): + return self._forward_cls.backward(*args, **kwargs) + else: + + return self._forward_cls.backward(*args, **kwargs) return self._forward_cls.backward(*args, **kwargs) diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 41a7d3d774793..8230e4bbd7774 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -78,6 +78,13 @@ BF16_WHITE_LIST = {'conv2d'} BF16_BLACK_LIST = {' '} +_g_amp_state_ = None + + +def amp_state(): + global _g_amp_state_ + return _g_amp_state_ + #NOTE(zhiqiu): similar as paddle.fluid.contrib.mixed_precision.fp16_lists.AutoMixedPrecisionLists._update_list # The reason why not use AutoMixedPrecisionLists is that custom_black_varnames is not suitable for imperative mode. 
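The hunks in this file work together with the py_layer.py change above: while amp_guard is active it publishes its arguments in the module-level _g_amp_state_ slot, PyLayerContext snapshots that slot when forward runs, and PyLayerBackward replays it before calling the user's backward. A minimal, framework-agnostic sketch of that pattern follows; the names autocast and DoubleOp, and the dict-shaped state, are illustrative assumptions rather than Paddle's actual API.

    # Illustrative sketch only; autocast/DoubleOp are placeholder names, not Paddle API.
    import contextlib

    _g_amp_state_ = None  # published by the autocast guard while it is active


    @contextlib.contextmanager
    def autocast(enable=True, level='O1'):
        # Publish the current configuration and restore the previous one on exit,
        # mirroring the save/restore added to amp_guard's try/finally below.
        global _g_amp_state_
        saved = _g_amp_state_
        _g_amp_state_ = {'enable': enable, 'level': level}
        try:
            yield
        finally:
            _g_amp_state_ = saved


    class DoubleOp:
        def forward(self, x):
            # Snapshot the ambient amp configuration, as PyLayerContext.__init__ does.
            self._amp_state = _g_amp_state_
            return x * 2

        def backward(self, grad):
            # Re-enter the forward-time configuration, as PyLayerBackward.backward does.
            if self._amp_state and self._amp_state.get('enable'):
                with autocast(**self._amp_state):
                    return grad * 2
            return grad * 2


    op = DoubleOp()
    with autocast(enable=True):
        y = op.forward(3.0)
    g = op.backward(1.0)  # backward sees the same amp state that forward saw

Keeping the state in a plain module-level variable, rather than threading it through every call, is what lets backward, which runs outside the original with-block, recover the forward-time configuration.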
@@ -240,6 +247,11 @@ def amp_guard(enable=True, print(conv.dtype) # FP32 """ + amp_state = locals() + global _g_amp_state_ + original_state = _g_amp_state_ + _g_amp_state_ = amp_state + # check amp_level: O0-O2 level = level.upper() if not (level in ['O0', 'O1', 'O2']): @@ -349,6 +361,7 @@ def amp_guard(enable=True, yield finally: if tracer: + _g_amp_state_ = original_state tracer._amp_level = original_amp_level tracer._set_amp_op_list(original_white_list, original_black_list) # set_flags(original_flags) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 0043a7f78b4b3..67c4bb3b2c746 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -20,6 +20,7 @@ from test_imperative_resnet import ResNet, BottleneckBlock, ConvBNLayer, train_parameters, optimizer_setting import paddle.nn as nn from paddle.static import InputSpec +from paddle.autograd import PyLayer if fluid.core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) @@ -1146,5 +1147,31 @@ def test_bf16(self): self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-1)) +class TestPyLayerWithAmp(unittest.TestCase): + def test_pylayer(self): + class MyMM(PyLayer): + @staticmethod + def forward(ctx, a, b): + ctx.save_for_backward(a, b) + return a.mm(b) + + @staticmethod + def backward(ctx, grad): + a, b = ctx.saved_tensor() + # NOTE(zhiqiu): a and b is float32 now, while grad is fp16 when forward runs with auto_cast() + # thus, the mm operation raise errors because of the dtype of inputs are inconsistent + return grad.mm(b.t()), a.t().mm(grad) + + x = paddle.rand([10, 10]) + y = paddle.rand([10, 10]) + x.stop_gradient = False + y.stop_gradient = False + + with paddle.amp.auto_cast(): + res = MyMM.apply(x, y) + loss = paddle.mean(res) + loss.backward() + + if __name__ == '__main__': unittest.main() From 2753c16fe19b018c22d2853bf614ebbb2abb6fb7 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 28 Feb 2022 10:01:37 +0800 Subject: [PATCH 73/85] [Phi] Add ClearHolder when re-alloc on new place in DeviceContext (#39833) * [Phi] Add ClearHolder when re-alloc on new place in DeviceContext * fix hostAlloc * foix inferRT unittest * remove dev_ctx ptr --- paddle/phi/core/CMakeLists.txt | 2 +- paddle/phi/core/dense_tensor.cc | 6 ++--- paddle/phi/core/device_context.cc | 38 ++++++++++++++++++++++++++----- 3 files changed, 36 insertions(+), 10 deletions(-) diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index d3c206c99dc22..f4f57a0acbbb3 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -16,13 +16,13 @@ cc_library(tensor_base SRCS tensor_base.cc allocator.cc DEPS phi_enforce) cc_library(tensor_meta SRCS tensor_meta.cc DEPS phi_enforce) cc_library(lod_utils SRCS lod_utils.cc DEPS phi_enforce) -cc_library(phi_device_context SRCS device_context.cc DEPS tensor_base) cc_library(dense_tensor SRCS dense_tensor.cc dense_tensor_impl.cc DEPS fluid_convert_utils tensor_meta tensor_base) cc_library(sparse_coo_tensor SRCS sparse_coo_tensor.cc DEPS tensor_meta tensor_base) cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_base) cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) 
+cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) diff --git a/paddle/phi/core/dense_tensor.cc b/paddle/phi/core/dense_tensor.cc index 44cb63e2b874b..7a0f50533360d 100644 --- a/paddle/phi/core/dense_tensor.cc +++ b/paddle/phi/core/dense_tensor.cc @@ -94,9 +94,9 @@ void* DenseTensor::AllocateFrom(Allocator* allocator, bytes)); bytes = requested_size; } - // TODO(paddle-dev): In case of the allocator of storage_ is different with - // the incoming allocator, we should re-alloc data using the incoming - // allocator. + // NOTE(paddle-dev): In case of the allocator of storage_ is different with + // the incoming allocator, we will re-alloc data using the incoming + // allocator. See DeviceContext.Alloc in core/device_context.cc. if (!holder_ || holder_->size() < bytes + meta_.offset) { meta_.offset = 0; VLOG(10) << "Allocate data with bytes: " << bytes; diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 9c1d85251f892..b139eb99dd484 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -13,8 +13,9 @@ // limitations under the License. #include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/core/tensor_base.h" +#include "paddle/phi/core/selected_rows.h" namespace phi { using DataType = paddle::experimental::DataType; @@ -72,6 +73,7 @@ struct DeviceContext::Impl { } void* Alloc(TensorBase* tensor, + const Place& place, DataType dtype = DataType::UNDEFINED, size_t requested_size = 0) const { PADDLE_ENFORCE_NOT_NULL( @@ -81,6 +83,12 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } + // NOTE(paddle-dev): In case of tensor has already hold allocation and + // is going to allocate allocation on new place, we will clear its holder + // firstly and then re-alloc it. + if (tensor->initialized() && tensor->place() != place) { + ClearHolder(tensor); + } auto* allocator = tensor->numel() == 0 ? zero_allocator_ : device_allocator_; return tensor->AllocateFrom( @@ -88,9 +96,11 @@ struct DeviceContext::Impl { } template - T* Alloc(TensorBase* tensor, size_t requested_size = 0) const { + T* Alloc(TensorBase* tensor, + const Place& place, + size_t requested_size = 0) const { DataType dtype = paddle::experimental::CppTypeToDataType::Type(); - return static_cast(Alloc(tensor, dtype, requested_size)); + return static_cast(Alloc(tensor, place, dtype, requested_size)); } void* HostAlloc(TensorBase* tensor, @@ -103,6 +113,9 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } + if (tensor->initialized() && tensor->place() != CPUPlace()) { + ClearHolder(tensor); + } auto* allocator = tensor->numel() == 0 ? 
zero_allocator_ : host_allocator_; return tensor->AllocateFrom( const_cast(allocator), dtype, requested_size); @@ -147,6 +160,19 @@ struct DeviceContext::Impl { } private: + void ClearHolder(TensorBase* tensor) const { + if (!tensor->initialized()) return; + + if (DenseTensor::classof(tensor)) { + static_cast(tensor)->clear(); + } else if (SelectedRows::classof(tensor)) { + static_cast(tensor)->mutable_value()->clear(); + } else { + PADDLE_THROW(errors::Unimplemented( + "Only support DenseTensor and SelectedRows now.")); + } + } + const Allocator* device_allocator_{nullptr}; const Allocator* host_allocator_{nullptr}; const Allocator* zero_allocator_{nullptr}; @@ -168,7 +194,7 @@ DeviceContext::DeviceContext(DeviceContext&& other) { impl_ = std::move(other.impl_); } -DeviceContext& DeviceContext::operator=(DeviceContext&&) = default; +DeviceContext& DeviceContext::operator=(DeviceContext&& other) = default; DeviceContext::~DeviceContext() = default; @@ -199,12 +225,12 @@ const Allocator& DeviceContext::GetZeroAllocator() const { void* DeviceContext::Alloc(TensorBase* tensor, DataType dtype, size_t requested_size) const { - return impl_->Alloc(tensor, dtype, requested_size); + return impl_->Alloc(tensor, GetPlace(), dtype, requested_size); } template T* DeviceContext::Alloc(TensorBase* tensor, size_t requested_size) const { - return impl_->Alloc(tensor, requested_size); + return impl_->Alloc(tensor, GetPlace(), requested_size); } void* DeviceContext::HostAlloc(TensorBase* tensor, From 0ff72e5d83a4d1861be9a72b9d5f84d098878ad6 Mon Sep 17 00:00:00 2001 From: Liu-xiandong <85323580+Liu-xiandong@users.noreply.github.com> Date: Mon, 28 Feb 2022 10:05:33 +0800 Subject: [PATCH 74/85] [KP] Unify .cu and .xpu files with .kps files (#39917) * [KP] Unify .cu and .xpu files with .kps files * fix CI bug in GPU and modify the list * fix conflict * modify the date --- cmake/operators.cmake | 12 ++ .../elementwise/elementwise_add_op.cu | 29 --- .../elementwise/elementwise_add_op.h | 23 ++- .../elementwise/elementwise_add_op.kps | 171 +++--------------- .../platform/device/xpu/xpu_op_kpfirst_list.h | 5 +- 5 files changed, 59 insertions(+), 181 deletions(-) delete mode 100644 paddle/fluid/operators/elementwise/elementwise_add_op.cu diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 8843dd2628767..7affd59de162d 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -73,6 +73,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + # rename in KP: .kps -> .cu + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() if (WITH_NV_JETSON) list(REMOVE_ITEM cu_srcs "decode_jpeg_op.cu") endif() @@ -96,6 +102,12 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND hip_srcs ${TARGET}.cu) endif() + # rename in KP: .kps -> .cu + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.kps) + file(COPY ${TARGET}.kps DESTINATION ${CMAKE_CURRENT_BINARY_DIR}) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.kps ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + list(APPEND hip_srcs ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}.cu) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu) set(PART_CUDA_KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.part.cu 
${PART_CUDA_KERNEL_FILES} PARENT_SCOPE) diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu deleted file mode 100644 index 52bf9b0e03f02..0000000000000 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/phi/kernels/gpu/elementwise.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - grad_add, ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel>, - ops::ElementwiseAddKernel>); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 1a256f7567578..ae2e5b33b5f43 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -13,7 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once - +#ifdef __xpu__ +#include +#include +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" +#include "paddle/fluid/platform/device/device_wrapper.h" +#else #include #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" @@ -21,6 +28,7 @@ limitations under the License. 
*/ // only can include the headers in paddle/phi/include dirs #include "paddle/phi/kernels/elementwise_grad_kernel.h" #include "paddle/phi/kernels/math_kernel.h" +#endif namespace paddle { namespace operators { @@ -28,7 +36,17 @@ namespace operators { template class ElementwiseAddKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { +#ifdef __xpu__ + std::vector ins; + std::vector outs; + int axis = PackTensorsIntoVector(ctx, &ins, &outs); + const auto& xpu_ctx = + ctx.template device_context(); + paddle::operators::LaunchElementwiseCudaKernel, 1>( + xpu_ctx, ins, &outs, axis, kps::AddFunctor()); +#else auto *x = ctx.Input("X"); auto *y = ctx.Input("Y"); auto *z = ctx.Output("Out"); @@ -40,6 +58,7 @@ class ElementwiseAddKernel : public framework::OpKernel { static_cast::TYPE &>(dev_ctx), *x, *y, axis, z); +#endif } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.kps b/paddle/fluid/operators/elementwise/elementwise_add_op.kps index a3fea0d7b3dbf..d6e0749318e90 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.kps +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.kps @@ -1,14 +1,19 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifdef PADDLE_WITH_XPU_KP + // Please do not modify the following code #if defined(__CUDA_ARCH__) #undef __CUDA_ARCH__ @@ -26,163 +31,31 @@ limitations under the License. 
*/ #undef __NVCC__ #endif -#ifdef PADDLE_WITH_XPU_KP #include // NOLINT #include "xpu/kernel/cluster_header.h" // NOLINT #include "xpu/kernel/debug.h" // NOLINT #include "xpu/kernel/math.h" // NOLINT -#include -#include #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_xpu.h" -#include "paddle/fluid/platform/device/device_wrapper.h" - -namespace paddle { -namespace operators { - -template -class ElementwiseAddXPUKPKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - std::vector ins; - std::vector outs; - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - const auto& xpu_ctx = - ctx.template device_context(); - paddle::operators::LaunchElementwiseCudaKernel, 1>( - xpu_ctx, ins, &outs, axis, kps::AddFunctor()); - } -}; - -static std::vector get_rdims(const std::vector& xdims, - const std::vector& ydims) { - std::vector rdims; - for (size_t i = 0; i < xdims.size(); i++) { - if (xdims[i] != ydims[i]) { - rdims.push_back(i); - } - } - return rdims; -} - -template -class ElementwiseAddGradXPUKPKernel : public ElemwiseGradKernel { - using XPUType = typename XPUTypeTrait::Type; - - public: - void Compute(const framework::ExecutionContext& ctx) const override { - ElemwiseGradKernel::Compute(ctx); - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dz = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - const framework::DDim& x_dims = x->dims(); - const framework::DDim& y_dims = y->dims(); - const framework::DDim& dz_dims = dz->dims(); - int axis = ctx.Attr("axis"); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - int max_dim = std::max(x_dims.size(), y_dims.size()); - PADDLE_ENFORCE_GE( - axis, 0, - platform::errors::InvalidArgument( - "Axis should be great than or equal to 0, but received axis is %d.", - axis)); - PADDLE_ENFORCE_LT( - axis, max_dim, - platform::errors::InvalidArgument( - "Axis should be less than %d, but received axis is %d.", max_dim, - axis)); - - std::vector x_dims_vec(max_dim, 1); - std::vector y_dims_vec(max_dim, 1); - std::vector z_dims_vec(max_dim, 1); - if (x_dims.size() == max_dim) { - for (int i = 0; i < max_dim; i++) { - x_dims_vec[i] = x_dims[i]; - } - } else { - for (int i = 0; i < x_dims.size(); i++) { - x_dims_vec[i + axis] = x_dims[i]; - } - } - - if (y_dims.size() == max_dim) { - for (int i = 0; i < max_dim; i++) { - y_dims_vec[i] = y_dims[i]; - } - } else { - for (int i = 0; i < y_dims.size(); i++) { - y_dims_vec[i + axis] = y_dims[i]; - } - } - - for (int i = 0; i < max_dim; i++) { - z_dims_vec[i] = dz_dims[i]; - } - std::vector rdims_for_x; - std::vector rdims_for_y; - rdims_for_x = get_rdims(x_dims_vec, z_dims_vec); - rdims_for_y = get_rdims(y_dims_vec, z_dims_vec); - const T* dz_data = dz->data(); - auto& dev_ctx = - ctx.template device_context(); - - if (dx != nullptr) { - T* dx_data = dx->mutable_data(ctx.GetPlace()); - if (rdims_for_x.size() == 0) { - if (dx_data != dz_data) { - framework::TensorCopy( - *dz, ctx.GetPlace(), - ctx.template device_context(), dx); - } - } else { - // For inplace strategy, dx will be stored in addr of dz, which makes - // the result of dy wrong. 
- if (dx->IsSharedBufferWith(*dz)) { - dx->clear(); - dx->mutable_data(x->dims(), ctx.GetPlace()); - } - - int ret = xpu::reduce_sum( - dev_ctx.x_context(), reinterpret_cast(dz_data), - reinterpret_cast(dx_data), z_dims_vec, rdims_for_x); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); - } - } - - if (dy != nullptr) { - T* dy_data = dy->mutable_data(ctx.GetPlace()); - if (rdims_for_y.size() == 0) { - if (dy_data != dz_data) { - framework::TensorCopy( - *dz, ctx.GetPlace(), - ctx.template device_context(), dy); - } - } else { - int ret = xpu::reduce_sum( - dev_ctx.x_context(), reinterpret_cast(dz_data), - reinterpret_cast(dy_data), z_dims_vec, rdims_for_y); - PADDLE_ENFORCE_XDNN_SUCCESS(ret, "reduce_sum "); - } - } - } -}; - -} // namespace operators -} // namespace paddle +#else +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/phi/kernels/gpu/elementwise.h" +#endif namespace ops = paddle::operators; namespace plat = paddle::platform; +#ifdef PADDLE_WITH_XPU_KP REGISTER_OP_KERNEL(elementwise_add, KP, plat::XPUPlace, - ops::ElementwiseAddXPUKPKernel); - -REGISTER_OP_KERNEL(elementwise_add_grad, KP, plat::XPUPlace, - ops::ElementwiseAddGradXPUKPKernel); - -#endif // PADDLE_WITH_XPU_KP + ops::ElementwiseAddKernel); +#else +REGISTER_OP_CUDA_KERNEL( + grad_add, ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel>, + ops::ElementwiseAddKernel>); +#endif \ No newline at end of file diff --git a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h index aa020593454f8..f79ef8505d878 100644 --- a/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h +++ b/paddle/fluid/platform/device/xpu/xpu_op_kpfirst_list.h @@ -27,7 +27,10 @@ using XPUKernelSet = using XPUOpMap = std::unordered_map; XPUOpMap& get_kp_ops() { - static XPUOpMap s_xpu_kp_kernels{}; + static XPUOpMap s_xpu_kp_kernels{ + {"elementwise_add", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + }; return s_xpu_kp_kernels; } From d4ae17754cb4d49f06b7e5007fc7890f5316def2 Mon Sep 17 00:00:00 2001 From: chenjian Date: Mon, 28 Feb 2022 10:11:54 +0800 Subject: [PATCH 75/85] add new profiler components (#39964) * add new profiler components * fix bug --- paddle/fluid/framework/operator.cc | 10 +- paddle/fluid/platform/dynload/cupti.h | 4 +- paddle/fluid/platform/profiler/CMakeLists.txt | 9 +- .../platform/profiler/cpu_utilization.cc | 172 ++++++++++++++++++ .../fluid/platform/profiler/cpu_utilization.h | 62 +++++++ paddle/fluid/platform/profiler/extra_info.h | 49 +++++ .../fluid/platform/profiler/output_logger.h | 1 - .../platform/profiler/test_extra_info.cc | 31 ++++ paddle/fluid/platform/profiler/utils.cc | 66 +++++++ paddle/fluid/platform/profiler/utils.h | 9 + paddle/phi/backends/dynload/cupti.h | 4 +- 11 files changed, 406 insertions(+), 11 deletions(-) mode change 100644 => 100755 paddle/fluid/platform/profiler/CMakeLists.txt create mode 100644 paddle/fluid/platform/profiler/cpu_utilization.cc create mode 100644 paddle/fluid/platform/profiler/cpu_utilization.h create mode 100644 paddle/fluid/platform/profiler/extra_info.h create mode 100644 paddle/fluid/platform/profiler/test_extra_info.cc create mode 100644 paddle/fluid/platform/profiler/utils.cc diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 692ebf6f332f1..b7332896818c9 100644 --- 
a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -263,11 +263,11 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { // in order to record different op type cost time // and different op name cost time,we set two event. platform::RecordEvent op_type_record_event( - Type().c_str(), platform::TracerEventType::Operator, 1); - auto op_name = platform::OpName(outputs_, Type()); - platform::RecordEvent op_name_record_event( - op_name, platform::TracerEventType::Operator, 1, - platform::EventRole::kUniqueOp); + Type(), platform::TracerEventType::Operator, 1); + // auto op_name = platform::OpName(outputs_, Type()); + // platform::RecordEvent op_name_record_event( + // op_name, platform::TracerEventType::Operator, 1, + // platform::EventRole::kUniqueOp); RunImpl(scope, place); } diff --git a/paddle/fluid/platform/dynload/cupti.h b/paddle/fluid/platform/dynload/cupti.h index dacfe2bd2e7f5..854e5a7b9f04a 100644 --- a/paddle/fluid/platform/dynload/cupti.h +++ b/paddle/fluid/platform/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -50,7 +51,8 @@ namespace dynload { __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); diff --git a/paddle/fluid/platform/profiler/CMakeLists.txt b/paddle/fluid/platform/profiler/CMakeLists.txt old mode 100644 new mode 100755 index 320e989bd9bb1..5acdfa39569f0 --- a/paddle/fluid/platform/profiler/CMakeLists.txt +++ b/paddle/fluid/platform/profiler/CMakeLists.txt @@ -1,8 +1,11 @@ cc_library(host_tracer SRCS host_tracer.cc DEPS enforce) cc_library(cuda_tracer SRCS cuda_tracer.cc cupti_data_process.cc DEPS workqueue_utils enforce glog) -cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer) cc_library(event_node SRCS event_node.cc DEPS enforce) -cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node) +cc_library(profiler_utils SRCS utils.cc DEPS enforce glog) +cc_library(chrometracinglogger SRCS chrometracing_logger.cc DEPS event_node profiler_utils) cc_test(test_event_node SRCS test_event_node.cc DEPS event_node chrometracinglogger) -cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) add_subdirectory(dump) +cc_library(cpu_utilization SRCS cpu_utilization.cc DEPS cpu_info os_info enforce glog) +cc_test(test_extra_info SRCS test_extra_info.cc DEPS profiler_utils) +cc_library(new_profiler SRCS profiler.cc DEPS host_tracer cuda_tracer profiler_utils cpu_utilization) +cc_test(new_profiler_test SRCS profiler_test.cc DEPS new_profiler event_node) diff --git a/paddle/fluid/platform/profiler/cpu_utilization.cc b/paddle/fluid/platform/profiler/cpu_utilization.cc new file mode 100644 index 0000000000000..672a9a154535a --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.cc @@ -0,0 +1,172 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/profiler/cpu_utilization.h" + +namespace paddle { +namespace platform { + +#ifdef _MSC_VER +static uint64_t FileTimeToUint64(FILETIME time) { + uint64_t low_part = time.dwLowDateTime; + uint64_t high_part = time.dwHighDateTime; + uint64_t result = (high_part << 32) | low_part; + return result; +} +#endif + +void CpuUtilization::RecordBeginTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&start_); + GetSystemTimes(&system_idle_time_start_, &system_kernel_time_start_, + &system_user_time_start_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_start_, &process_user_time_start_); + +#elif defined(__linux__) + start_ = times(&process_tms_start_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_start_.tms_utime, &nice_time_start_, + &system_tms_start_.tms_stime, &idle_start_, &iowait_start_, + &irq_start_, &softirq_start_, &steal_start_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + } +#else +#endif +} + +void CpuUtilization::RecordEndTimeInfo() { +#if defined(_MSC_VER) + HANDLE process_handle = GetCurrentProcess(); + GetSystemTimeAsFileTime(&end_); + GetSystemTimes(&system_idle_time_end_, &system_kernel_time_end_, + &system_user_time_end_); + GetProcessTimes(process_handle, &process_creation_time_, &process_exit_time_, + &process_kernel_time_end_, &process_user_time_end_); +#elif defined(__linux__) + end_ = times(&process_tms_end_); +#define proc_path_size 1024 + static char proc_stat_path[proc_path_size] = "/proc/stat"; + FILE *stat_file = fopen(proc_stat_path, "r"); + if (stat_file != nullptr) { + char temp_str[200]; + uint64_t temp_lu; + while (true) { + int retval = fscanf( + stat_file, "%s %" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 + "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64 "%" PRIu64, + temp_str, &system_tms_end_.tms_utime, &nice_time_end_, + &system_tms_end_.tms_stime, &idle_end_, &iowait_end_, &irq_end_, + &softirq_end_, &steal_end_, &temp_lu, &temp_lu); + if (std::string(temp_str).find("cpu") != 0) { + break; + } + if (retval != 11) { + return; + } + } + fclose(stat_file); + 
} +#else +#endif +} + +float CpuUtilization::GetCpuUtilization() { + float cpu_utilization = 0.0; +#if defined(_MSC_VER) + uint64_t system_user_time_start = FileTimeToUint64(system_user_time_start_); + uint64_t system_user_time_end = FileTimeToUint64(system_user_time_end_); + uint64_t system_kernel_time_start = + FileTimeToUint64(system_kernel_time_start_); + uint64_t system_kernel_time_end = FileTimeToUint64(system_kernel_time_end_); + uint64_t system_idle_time_start = FileTimeToUint64(system_idle_time_start_); + uint64_t system_idle_time_end = FileTimeToUint64(system_idle_time_end_); + float busy_time = (system_kernel_time_end - system_kernel_time_start) + + (system_user_time_end - system_user_time_start); + float idle_time = system_idle_time_end - system_idle_time_start; + cpu_utilization = busy_time / (busy_time + idle_time); + +#elif defined(__linux__) + float busy_time = (system_tms_end_.tms_utime - system_tms_start_.tms_utime) + + (system_tms_end_.tms_stime - system_tms_start_.tms_stime) + + (nice_time_end_ - nice_time_start_) + + (irq_end_ - irq_start_) + (softirq_end_ - softirq_start_) + + (steal_end_ - steal_start_); + float idle_time = (idle_end_ - idle_start_) + (iowait_end_ - iowait_start_); + cpu_utilization = busy_time / (busy_time + idle_time); +#else + LOG(WARNING) + << "Current System is not supported to get system cpu utilization" + << cpu_utilization << std::endl; +#endif + return cpu_utilization; +} + +float CpuUtilization::GetCpuCurProcessUtilization() { + float cpu_process_utilization = 0.0; +#ifdef _MSC_VER + uint64_t process_user_time_start = FileTimeToUint64(process_user_time_start_); + uint64_t process_user_time_end = FileTimeToUint64(process_user_time_end_); + uint64_t process_kernel_time_start = + FileTimeToUint64(process_kernel_time_start_); + uint64_t process_kernel_time_end = FileTimeToUint64(process_kernel_time_end_); + uint64_t start = FileTimeToUint64(start_); + uint64_t end = FileTimeToUint64(end_); + float busy_time = (process_kernel_time_end - process_kernel_time_start) + + (process_user_time_end - process_user_time_start); + cpu_process_utilization = busy_time / (end - start); + LOG(INFO) << "Process Utilization = " << cpu_process_utilization << std::endl; +#elif defined(__linux__) + float busy_time = + (process_tms_end_.tms_utime - process_tms_start_.tms_utime) + + (process_tms_end_.tms_stime - process_tms_start_.tms_stime); + cpu_process_utilization = busy_time / (end_ - start_); +#else + LOG(WARNING) + << "Current System is not supported to get process cpu utilization" + << cpu_process_utilization << std::endl; +#endif + return cpu_process_utilization; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/cpu_utilization.h b/paddle/fluid/platform/profiler/cpu_utilization.h new file mode 100644 index 0000000000000..7b05a6302cdb0 --- /dev/null +++ b/paddle/fluid/platform/profiler/cpu_utilization.h @@ -0,0 +1,62 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "glog/logging.h" +#ifdef _MSC_VER +#include +#else +#include +#include +#endif + +namespace paddle { +namespace platform { + +class CpuUtilization { + public: + CpuUtilization() {} + void RecordBeginTimeInfo(); + void RecordEndTimeInfo(); + float GetCpuUtilization(); + float GetCpuCurProcessUtilization(); + + private: +#ifdef _MSC_VER + FILETIME start_, end_; + FILETIME process_user_time_start_, process_user_time_end_; + FILETIME process_kernel_time_start_, process_kernel_time_end_; + FILETIME system_user_time_start_, system_user_time_end_; + FILETIME system_kernel_time_start_, system_kernel_time_end_; + FILETIME system_idle_time_start_, system_idle_time_end_; + FILETIME process_creation_time_, process_exit_time_; +#else + clock_t start_, end_; + uint64_t idle_start_, idle_end_; + uint64_t iowait_start_, iowait_end_; + uint64_t nice_time_start_, nice_time_end_; + uint64_t irq_start_, irq_end_; + uint64_t softirq_start_, softirq_end_; + uint64_t steal_start_, steal_end_; + struct tms system_tms_start_, system_tms_end_; + struct tms process_tms_start_, process_tms_end_; +#endif +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/extra_info.h b/paddle/fluid/platform/profiler/extra_info.h new file mode 100644 index 0000000000000..04532592ebd30 --- /dev/null +++ b/paddle/fluid/platform/profiler/extra_info.h @@ -0,0 +1,49 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/platform/profiler/utils.h" + +namespace paddle { +namespace platform { + +class ExtraInfo { + public: + ExtraInfo() {} + template + void AddExtraInfo(const std::string& key, const std::string& format, + Args... args); + void Clear() { extra_info_.clear(); } + std::unordered_map GetExtraInfo() { + return extra_info_; + } + + private: + std::unordered_map extra_info_; +}; + +template +void ExtraInfo::AddExtraInfo(const std::string& key, const std::string& format, + Args... 
args) { + std::string value = string_format(format, args...); + extra_info_[key] = value; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/output_logger.h b/paddle/fluid/platform/profiler/output_logger.h index ff4effad5ecc4..05a68cf2a4a8d 100644 --- a/paddle/fluid/platform/profiler/output_logger.h +++ b/paddle/fluid/platform/profiler/output_logger.h @@ -33,7 +33,6 @@ class BaseLogger { virtual void LogHostTraceEventNode(const HostTraceEventNode&) {} virtual void LogRuntimeTraceEventNode(const CudaRuntimeTraceEventNode&) {} virtual void LogNodeTrees(const NodeTrees&) {} - virtual void LogMetaInfo() {} }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/test_extra_info.cc b/paddle/fluid/platform/profiler/test_extra_info.cc new file mode 100644 index 0000000000000..7274c9de977e9 --- /dev/null +++ b/paddle/fluid/platform/profiler/test_extra_info.cc @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include "paddle/fluid/platform/profiler/extra_info.h" + +using paddle::platform::ExtraInfo; + +TEST(ExtraInfoTest, case0) { + ExtraInfo instance; + instance.AddExtraInfo(std::string("info1"), std::string("%d"), 20); + instance.AddExtraInfo(std::string("info2"), std::string("%s"), "helloworld"); + std::unordered_map map = instance.GetExtraInfo(); + EXPECT_EQ(map["info1"], "20"); + EXPECT_EQ(map["info2"], "helloworld"); + EXPECT_EQ(map.size(), 2u); + instance.Clear(); + map = instance.GetExtraInfo(); + EXPECT_EQ(map.size(), 0u); +} diff --git a/paddle/fluid/platform/profiler/utils.cc b/paddle/fluid/platform/profiler/utils.cc new file mode 100644 index 0000000000000..b43389866c7a8 --- /dev/null +++ b/paddle/fluid/platform/profiler/utils.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/profiler/utils.h" + +#include + +#include "glog/logging.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/dynload/cupti.h" + +namespace paddle { +namespace platform { +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t DeviceId, uint16_t RegistersPerThread, + int32_t StaticSharedMemory, + int32_t DynamicSharedMemory, int32_t BlockX, + int32_t BlockY, int32_t BlockZ, float BlocksPerSm) { + float occupancy = 0.0; + std::vector device_ids = GetSelectedDevices(); + if (DeviceId < device_ids.size()) { + const gpuDeviceProp& device_property = GetDeviceProperties(DeviceId); + cudaOccFuncAttributes occFuncAttr; + occFuncAttr.maxThreadsPerBlock = INT_MAX; + occFuncAttr.numRegs = RegistersPerThread; + occFuncAttr.sharedSizeBytes = StaticSharedMemory; + occFuncAttr.partitionedGCConfig = PARTITIONED_GC_OFF; + occFuncAttr.shmemLimitConfig = FUNC_SHMEM_LIMIT_DEFAULT; + occFuncAttr.maxDynamicSharedSizeBytes = 0; + const cudaOccDeviceState occDeviceState = {}; + int blockSize = BlockX * BlockY * BlockZ; + size_t dynamicSmemSize = DynamicSharedMemory; + cudaOccResult occ_result; + cudaOccDeviceProp prop(device_property); + cudaOccError status = cudaOccMaxActiveBlocksPerMultiprocessor( + &occ_result, &prop, &occFuncAttr, &occDeviceState, blockSize, + dynamicSmemSize); + if (status == CUDA_OCC_SUCCESS) { + if (occ_result.activeBlocksPerMultiprocessor < BlocksPerSm) { + BlocksPerSm = occ_result.activeBlocksPerMultiprocessor; + } + occupancy = + BlocksPerSm * blockSize / + static_cast(device_property.maxThreadsPerMultiProcessor); + } else { + LOG(WARNING) << "Failed to calculate estimated occupancy, status = " + << status << std::endl; + } + } + return occupancy; +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/profiler/utils.h b/paddle/fluid/platform/profiler/utils.h index 04014b972c3e3..cd56d34384268 100644 --- a/paddle/fluid/platform/profiler/utils.h +++ b/paddle/fluid/platform/profiler/utils.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/os_info.h" namespace paddle { @@ -42,5 +45,11 @@ static std::string GetStringFormatLocalTime() { static int64_t nsToUs(int64_t ns) { return ns / 1000; } +#ifdef PADDLE_WITH_CUPTI +float CalculateEstOccupancy(uint32_t deviceId, uint16_t registersPerThread, + int32_t staticSharedMemory, + int32_t dynamicSharedMemory, int32_t blockX, + int32_t blockY, int32_t blockZ, float blocksPerSm); +#endif } // namespace platform } // namespace paddle diff --git a/paddle/phi/backends/dynload/cupti.h b/paddle/phi/backends/dynload/cupti.h index fe98fa6bd37ef..a526fbfd92639 100644 --- a/paddle/phi/backends/dynload/cupti.h +++ b/paddle/phi/backends/dynload/cupti.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUPTI #include +#include #include #include // NOLINT @@ -63,7 +64,8 @@ extern void *cupti_dso_handle; __macro(cuptiSubscribe); \ __macro(cuptiUnsubscribe); \ __macro(cuptiEnableCallback); \ - __macro(cuptiEnableDomain); + __macro(cuptiEnableDomain); \ + __macro(cudaOccMaxActiveBlocksPerMultiprocessor); CUPTI_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUPTI_WRAP); From bd9b9460a29364b88ef28f13436aa25e05707887 Mon Sep 17 00:00:00 2001 From: zmxdream Date: Mon, 28 Feb 2022 10:12:18 +0800 Subject: [PATCH 76/85] fix ps_gpu_wrapper (#39965) --- paddle/fluid/pybind/ps_gpu_wrapper_py.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc index 629dc2c4037e7..e8c338b3fd188 100644 --- a/paddle/fluid/pybind/ps_gpu_wrapper_py.cc +++ b/paddle/fluid/pybind/ps_gpu_wrapper_py.cc @@ -44,8 +44,6 @@ void BindPSGPUWrapper(py::module* m) { .def("set_slot_offset_vector", &framework::PSGPUWrapper::SetSlotOffsetVector, py::call_guard()) - .def("init_GPU_server", &framework::PSGPUWrapper::InitializeGPUServer, - py::call_guard()) .def("set_date", &framework::PSGPUWrapper::SetDate, py::call_guard()) .def("set_dataset", &framework::PSGPUWrapper::SetDataset, From aceb25e1bf62b5abdf00fffcf13232c413b895e5 Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 28 Feb 2022 10:33:11 +0800 Subject: [PATCH 77/85] [Pten] Support optional param for C++ API (#39760) * fix selected_rows bug in C++ API * add optional for C++ APIO * data transform support optional * remove data transform for optional vector * adjust some format of funtcion * fix empyt bug --- paddle/phi/api/lib/api_custom_impl.cc | 4 +- paddle/phi/api/lib/api_utils.h | 32 ++++ paddle/phi/api/lib/data_transform.cc | 10 ++ paddle/phi/api/lib/data_transform.h | 5 + paddle/phi/api/lib/kernel_dispatch.h | 2 +- paddle/phi/api/lib/sparse_api.cc | 6 +- paddle/phi/infermeta/backward.cc | 17 +++ paddle/phi/infermeta/backward.h | 7 + paddle/phi/kernels/empty_kernel.cc | 3 +- .../kernels/impl/matmul_grad_kernel_impl.h | 1 - paddle/phi/tests/api/scale_api.h | 4 +- paddle/phi/tests/api/test_matmul_api.cc | 27 ++++ paddle/utils/optional.h | 1 + python/paddle/utils/code_gen/api_base.py | 144 ++++++++++++++---- python/paddle/utils/code_gen/api_gen.py | 1 + python/paddle/utils/code_gen/backward.yaml | 20 +-- .../paddle/utils/code_gen/backward_api_gen.py | 7 +- 17 files changed, 240 insertions(+), 51 deletions(-) diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 67b743016707a..89a51dde46312 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -34,7 +34,7 @@ namespace experimental { Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( "copy", kernel_key); @@ -67,7 +67,7 @@ std::vector split_impl(const Tensor& x, const ScalarArray& num_or_sections, const Scalar& axis) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); Backend kernel_backend = kernel_key.backend(); DataLayout kernel_layout = kernel_key.layout(); diff --git 
a/paddle/phi/api/lib/api_utils.h b/paddle/phi/api/lib/api_utils.h index 948e40cd28d58..d44dde3b74dd2 100644 --- a/paddle/phi/api/lib/api_utils.h +++ b/paddle/phi/api/lib/api_utils.h @@ -31,6 +31,14 @@ inline std::shared_ptr TensorToDenseTensor( return std::dynamic_pointer_cast(tensor.impl()); } +inline std::shared_ptr TensorToDenseTensor( + const paddle::optional& tensor) { + if (tensor) { + return std::dynamic_pointer_cast(tensor->impl()); + } + return nullptr; +} + inline std::unique_ptr> TensorToDenseTensor( const std::vector& tensors) { auto pt_tensors = std::make_unique>(); @@ -49,12 +57,28 @@ inline std::shared_ptr TensorToSelectedRows( return std::dynamic_pointer_cast(tensor.impl()); } +inline std::shared_ptr TensorToSelectedRows( + const paddle::optional& tensor) { + if (tensor) { + return std::dynamic_pointer_cast(tensor->impl()); + } + return nullptr; +} + /* ----------------- for infer_meta --------------------- */ inline phi::MetaTensor MakeMetaTensor(const phi::DenseTensor& tensor) { return phi::MetaTensor(tensor); } +inline paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + inline std::vector MakeMetaTensor( const std::vector& tensors) { std::vector meta_tensors; @@ -69,6 +93,14 @@ inline phi::MetaTensor MakeMetaTensor(const phi::SelectedRows& tensor) { return phi::MetaTensor(tensor); } +inline paddle::optional MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return {paddle::none}; +} + /* ------------------ for output ----------------------- */ inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 9fd91f398f7f4..2074ddd8a9127 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -199,6 +199,16 @@ std::shared_ptr PrepareData( return std::make_shared(out); } +std::shared_ptr PrepareData( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag) { + if (input) { + return PrepareData(*input, target_args_def, transform_flag); + } + return {nullptr}; +} + std::unique_ptr> PrepareData( const std::vector& inputs, const phi::TensorArgDef& target_args_def, diff --git a/paddle/phi/api/lib/data_transform.h b/paddle/phi/api/lib/data_transform.h index 9942b2f90b03b..8eb1c4a179aed 100644 --- a/paddle/phi/api/lib/data_transform.h +++ b/paddle/phi/api/lib/data_transform.h @@ -66,6 +66,11 @@ std::shared_ptr PrepareData( const phi::TensorArgDef& target_args_def, const TransformFlag& transform_flag); +std::shared_ptr PrepareData( + const paddle::optional& input, + const phi::TensorArgDef& target_args_def, + const TransformFlag& transform_flag); + std::unique_ptr> PrepareData( const std::vector& inputs, const phi::TensorArgDef& target_args_def, diff --git a/paddle/phi/api/lib/kernel_dispatch.h b/paddle/phi/api/lib/kernel_dispatch.h index ad315ededf5d7..9a09bc2183ad7 100644 --- a/paddle/phi/api/lib/kernel_dispatch.h +++ b/paddle/phi/api/lib/kernel_dispatch.h @@ -51,7 +51,7 @@ struct KernelKeySet { DataType dtype{DataType::UNDEFINED}; // TODO(chenweihang): iterate all kernelkey for kernel selection - phi::KernelKey GetHigestPriorityKernelKey() { + phi::KernelKey GetHighestPriorityKernelKey() { return phi::KernelKey(static_cast(64 - detail::CountLeadingZeros( backend_set.bitset())), layout, diff --git 
a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api.cc index cc90c2b819dae..c0c10e0ac6a48 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api.cc @@ -51,7 +51,7 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_coo"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_coo"; @@ -112,7 +112,7 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "dense_to_sparse_csr"; if (x.layout() == phi::DataLayout::SPARSE_COO) { kernel_name = "sparse_coo_to_csr"; @@ -179,7 +179,7 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { // 1. Get kernel signature and kernel auto kernel_key_set = ParseKernelKeyByInputArgs(x); kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); std::string kernel_name = "sparse_coo_to_dense"; if (x.layout() == phi::DataLayout::SPARSE_CSR) { kernel_name = "sparse_csr_to_dense"; diff --git a/paddle/phi/infermeta/backward.cc b/paddle/phi/infermeta/backward.cc index 643a6dc9ddf36..7d403fee94300 100644 --- a/paddle/phi/infermeta/backward.cc +++ b/paddle/phi/infermeta/backward.cc @@ -76,6 +76,23 @@ void GeneralBinaryGradInferMeta(const MetaTensor& x, } } +void GeneralTernaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz) { + if (dx) { + dx->share_meta(x); + } + if (dy) { + dy->share_meta(y); + } + if (dz) { + dz->share_meta(z); + } +} + void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, diff --git a/paddle/phi/infermeta/backward.h b/paddle/phi/infermeta/backward.h index 5afa678ddac70..c7090ed664b28 100644 --- a/paddle/phi/infermeta/backward.h +++ b/paddle/phi/infermeta/backward.h @@ -34,6 +34,13 @@ void GeneralBinaryGradInferMeta(const MetaTensor& x, MetaTensor* dx, MetaTensor* dy); +void GeneralTernaryGradInferMeta(const MetaTensor& x, + const MetaTensor& y, + const MetaTensor& z, + MetaTensor* dx, + MetaTensor* dy, + MetaTensor* dz); + void GumbelSoftmaxGradInferMeta(const MetaTensor& out, const MetaTensor& dout, int axis, diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 8109d3879cb21..a902bd605542c 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -25,7 +25,8 @@ void EmptyKernel(const Context& dev_ctx, const ScalarArray& shape, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); + dev_ctx.template Alloc(out); } template diff --git a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h index 8b94fa1d22eb5..f2549c171dda0 100644 
--- a/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/matmul_grad_kernel_impl.h @@ -596,7 +596,6 @@ void MatmulDoubleGradKernel(const Context& dev_ctx, ddout_flag = true; } } - if (ddy) { auto ddy_mat = ddy.get(); if (ddy_mat.dims() != y_help.dims()) { diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index b6ca081e97866..829b93b88b4f9 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -42,7 +42,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, kernel_layout == DataLayout::UNDEFINED || kernel_data_type == DataType::UNDEFINED) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); if (kernel_backend == Backend::UNDEFINED) { kernel_backend = kernel_key.backend(); } @@ -215,7 +215,7 @@ Tensor scale_switch_case(const Tensor& x, kernel_layout == DataLayout::UNDEFINED || kernel_data_type == DataType::UNDEFINED) { auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); if (kernel_backend == Backend::UNDEFINED) { kernel_backend = kernel_key.backend(); } diff --git a/paddle/phi/tests/api/test_matmul_api.cc b/paddle/phi/tests/api/test_matmul_api.cc index e5fc9c5b1f64b..2a3dd9c7dff62 100644 --- a/paddle/phi/tests/api/test_matmul_api.cc +++ b/paddle/phi/tests/api/test_matmul_api.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include #include +#include "paddle/phi/api/backward/backward_api.h" #include "paddle/phi/api/include/api.h" #include "paddle/phi/api/lib/utils/allocator.h" @@ -161,5 +162,31 @@ TEST(API, matmul_cuda) { #endif +TEST(API, matmul_double_grad) { + // 1. create tensor + auto x = paddle::experimental::full({3, 3}, 1.0); + auto y = paddle::experimental::full({3, 3}, 2.0); + auto out_grad = paddle::experimental::full({3, 3}, 2.0); + auto dx_grad = paddle::experimental::full({3, 3}, 2.0); + + // 2. test API + const auto out = paddle::experimental::matmul_double_grad( + x, y, out_grad, dx_grad, {}, false, false); + + // 3. check result + ASSERT_EQ(out.size(), 3UL); + ASSERT_EQ(out[0].size(), 1UL); + ASSERT_EQ(out[1].size(), 1UL); + ASSERT_EQ(out[2].size(), 1UL); + ASSERT_EQ(out[0][0].dims()[1], 3); + ASSERT_EQ(out[0][0].numel(), 9); + ASSERT_EQ(out[1][0].numel(), 9); + ASSERT_EQ(out[2][0].numel(), 9); + ASSERT_EQ(out[0][0].type(), phi::DataType::FLOAT32); + ASSERT_EQ(out[0][0].layout(), phi::DataLayout::NCHW); + ASSERT_EQ(out[1][0].initialized(), true); + ASSERT_EQ(out[2][0].initialized(), true); +} + } // namespace tests } // namespace paddle diff --git a/paddle/utils/optional.h b/paddle/utils/optional.h index d2a9a3f11ef3c..eec5f32be7226 100644 --- a/paddle/utils/optional.h +++ b/paddle/utils/optional.h @@ -20,6 +20,7 @@ #pragma once #include +#include #include #include #include diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 997b64db96791..5fc9dfe3f6499 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -35,7 +35,7 @@ def __init__(self, api_item_yaml): # args_str: # args_declare : "str" // str of function params with default value. Example: (..., bool flag=false) # args_define : "str" // str of function params without default value. 
Example: (..., bool flag) - self.inputs, self.attrs, self.outputs, self.args_str = self.parse_args( + self.inputs, self.attrs, self.outputs, self.args_str, self.optional_vars = self.parse_args( self.api, api_item_yaml) self.is_base_api = True @@ -57,17 +57,22 @@ def get_api_func_name(self): return self.api def parse_args(self, api_name, api_item_yaml): + optional_vars = [] + if 'optional' in api_item_yaml: + optional_vars = [ + item.strip() for item in api_item_yaml['optional'].split(',') + ] inputs, attrs, args_str = self.parse_input_and_attr( - api_name, api_item_yaml['args']) + api_name, api_item_yaml['args'], optional_vars) output_type_list, output_names, return_type = self.parse_output( api_name, api_item_yaml['output']) return inputs, attrs, { 'names': output_names, 'types': output_type_list, 'return_type': return_type - }, args_str + }, args_str, optional_vars - def parse_input_and_attr(self, api_name, args_config): + def parse_input_and_attr(self, api_name, args_config, optional_vars=[]): inputs = {'names': [], 'input_info': {}} attrs = {'names': [], 'attr_info': {}} args_str = args_config.strip() @@ -79,11 +84,43 @@ def parse_input_and_attr(self, api_name, args_config): 'Tensor': 'const Tensor&', 'Tensor[]': 'const std::vector&' } - attr_types_map = {'ScalarArray' : 'const ScalarArray&', 'Scalar' : 'const Scalar&', \ - 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ - 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ - 'Backend' : 'Backend', 'DataLayout' : 'DataLayout', 'DataType' : 'DataType', \ - 'int64_t[]' : 'const std::vector&', 'int[]' : 'const std::vector&'} + attr_types_map = { + 'ScalarArray': 'const ScalarArray&', + 'Scalar': 'const Scalar&', + 'int': 'int', + 'int32_t': 'int32_t', + 'int64_t': 'int64_t', + 'long': 'long', + 'size_t': 'size_t', + 'float': 'float', + 'double': 'double', + 'bool': 'bool', + 'Backend': 'Backend', + 'DataLayout': 'DataLayout', + 'DataType': 'DataType', + 'int64_t[]': 'const std::vector&', + 'int[]': 'const std::vector&', + 'long[]': 'const std::vector&' + } + optional_types_trans = { + 'Tensor': 'const paddle::optional&', + 'Tensor[]': 'const paddle::optional>&', + 'ScalarArray': 'const paddle::optional&', + 'Scalar': 'const paddle::optional&', + 'int': 'paddle::optional', + 'int32_t': 'paddle::optional', + 'int64_t': 'paddle::optional', + 'size_t': 'paddle::optional', + 'float': 'paddle::optional', + 'double': 'paddle::optional', + 'bool': 'paddle::optional', + 'Backend': 'paddle::optional', + 'DataLayout': 'paddle::optional', + 'DataType': 'paddle::optional', + 'int64_t[]': 'paddle::optional>', + 'int[]': 'paddle::optional>' + } + args_declare_str = "" args_define_str = "" @@ -100,6 +137,9 @@ def parse_input_and_attr(self, api_name, args_config): assert len(attrs['names']) == 0, \ f"The input Tensor should appear before attributes. 
please check the position of {api_name}:input({input_name}) in yaml" + if input_name in optional_vars: + in_type = optional_types_trans[in_type_symbol] + inputs['names'].append(input_name) inputs['input_info'][input_name] = in_type args_declare_str = args_declare_str + in_type + ' ' + input_name + ', ' @@ -121,6 +161,9 @@ def parse_input_and_attr(self, api_name, args_config): attr_name = attr_infos[0].strip() default_value = attr_infos[1].strip() + if attr_name in optional_vars: + attr_type = optional_types_trans[attr_type_symbol] + default_value_str = "" if default_value is None else '=' + default_value args_declare_str = args_declare_str + attr_type + ' ' + attr_name + default_value_str + ', ' args_define_str = args_define_str + attr_type + ' ' + attr_name + ', ' @@ -381,7 +424,7 @@ def gene_kernel_select(self) -> str: || kernel_layout == DataLayout::UNDEFINED || kernel_data_type == DataType::UNDEFINED ) {{ auto kernel_key_set = ParseKernelKeyByInputArgs({kernel_select_args}); - auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey(); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); if (kernel_backend == Backend::UNDEFINED) {{ kernel_backend = kernel_key.backend(); }} @@ -408,7 +451,17 @@ def gene_infer_meta(self, kernel_output_names, code_indent) -> str: param_code = "" for param in infer_meta_params: if param in input_names: - param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + if param in self.optional_vars: + meta_tensor_code = meta_tensor_code + f""" +{code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); +{code_indent} if ({PREFIX_TENSOR_NAME}meta_{param}) {{ +{code_indent} {PREFIX_TENSOR_NAME}meta_ref_{param} = paddle::make_optional(*{PREFIX_TENSOR_NAME}meta_{param}); +{code_indent} }}""" + + param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " + else: + param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " elif param in kernel_output_names: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" @@ -435,7 +488,11 @@ def get_kernel_args(self, code_indent): 'const std::vector&': 'const std::vector&', 'const std::vector &': - 'const std::vector&' + 'const std::vector&', + 'const paddle::optional&': + 'paddle::optional', + 'const paddle::optional>&': + 'paddle::optional&>' } out_trans_map = { 'Tensor': 'phi::DenseTensor*', @@ -459,19 +516,40 @@ def get_kernel_args(self, code_indent): trans_flag = "{true}" elif input_name in self.data_transform['support_trans_dtype']: trans_flag = "{false, true}" - input_tensor_code = input_tensor_code + f""" + if input_name in self.optional_vars: + input_tensor_code = input_tensor_code + f""" +{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag}); +{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); +{code_indent} }}""" + + else: + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({i}), {trans_flag});""" else: - input_tensor_code = input_tensor_code + f""" + if 
input_name in self.optional_vars: + input_tensor_code = input_tensor_code + f""" +{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = TensorToDenseTensor({input_name}); +{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); +{code_indent} }}""" + + else: + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = TensorToDenseTensor({input_name});""" kernel_args = "*dev_ctx, " for param in kernel_param: if param in input_names: - kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " - kernel_args_type_list.append(input_trans_map[input_infos[ - param]]) + if param in self.optional_vars: + kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " + else: + kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + kernel_in_type = input_trans_map[input_infos[param]] + kernel_args_type_list.append(kernel_in_type) elif param in attr_names: # set attr for kernel_context if 'ScalarArray' in self.attrs['attr_info'][param][0]: @@ -499,21 +577,16 @@ def get_kernel_args(self, code_indent): def get_selected_rows_kernel_args(self, code_indent): input_trans_map = { 'const Tensor&': 'const phi::SelectedRows&', - 'const Tensor &': 'const phi::SelectedRows&' + 'const Tensor &': 'const phi::SelectedRows&', + 'const paddle::optional&': + 'paddle::optional' } out_trans_map = {'Tensor': 'phi::SelectedRows*'} input_names = self.inputs['names'] input_infos = self.inputs['input_info'] kernel_args_type_list = ['const platform::DeviceContext&'] - input_tensor_code = "" - for input_name in input_names: - # set input code - input_tensor_code = input_tensor_code + f""" - auto {PREFIX_TENSOR_NAME}{input_name} = TensorToSelectedRows({input_name});""" - attr_names = self.attrs['names'] - kernel_param = self.kernel['param'] if kernel_param is None: kernel_param = input_names + attr_names @@ -521,15 +594,28 @@ def get_selected_rows_kernel_args(self, code_indent): input_tensor_code = "" for i, input_name in enumerate(input_names): # set input code - input_tensor_code = input_tensor_code + f""" + if input_name in self.optional_vars: + input_tensor_code = input_tensor_code + f""" + +{code_indent} {input_trans_map[input_infos[input_name]]} {PREFIX_TENSOR_NAME}{input_name}(paddle::none); +{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_ptr = TensorToSelectedRows({input_name}); +{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_ptr) {{ +{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::make_optional(*{PREFIX_TENSOR_NAME}{input_name}_ptr); +{code_indent} }}""" + + else: + input_tensor_code = input_tensor_code + f""" {code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = TensorToSelectedRows({input_name});""" kernel_args = "*dev_ctx, " for param in kernel_param: if param in input_names: - kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " - kernel_args_type_list.append(input_trans_map[input_infos[ - param]]) + if param in self.optional_vars: + kernel_args = kernel_args + PREFIX_TENSOR_NAME + param + ", " + else: + kernel_args = kernel_args + "*" + PREFIX_TENSOR_NAME + param + ", " + kernel_in_type = input_trans_map[input_infos[param]] + kernel_args_type_list.append(kernel_in_type) elif param in attr_names: # set attr for kernel_context if 'ScalarArray' in self.attrs['attr_info'][param][0]: diff --git 
a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py index 77af217f7b52e..a26630ad04100 100644 --- a/python/paddle/utils/code_gen/api_gen.py +++ b/python/paddle/utils/code_gen/api_gen.py @@ -92,6 +92,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/utils/optional.h" """ diff --git a/python/paddle/utils/code_gen/backward.yaml b/python/paddle/utils/code_gen/backward.yaml index 62b724432e928..cdda5cb1f05e8 100644 --- a/python/paddle/utils/code_gen/backward.yaml +++ b/python/paddle/utils/code_gen/backward.yaml @@ -8,6 +8,17 @@ kernel : func : matmul_grad +- backward_api : matmul_double_grad + forward : matmul_grad (Tensor x, Tensor y, Tensor out_grad, bool transpose_x, bool transpose_y) -> Tensor(dx), Tensor(dy) + args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) + output : Tensor(d2x), Tensor(d2y), Tensor(dout_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, y, out_grad] + kernel : + func : matmul_double_grad + optional : dx_grad, dy_grad + - backward_api : scale_grad forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) args : (Tensor out_grad, Scalar scale, float bias=0.0, bool bias_after_scale=true) @@ -15,15 +26,6 @@ invoke : scale(out_grad, scale, bias, bias_after_scale) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. -# -# - backward_api : matmul_double_grad -# forward : matmul_grad (Tensor x, Tensor y, Tensor out_grad, bool transpose_x, bool transpose_y) -> Tensor(dx), Tensor>(dy) -# args : (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) -# output : Tensor(d2x), Tensor(d2y), Tensor(dout_grad) -# infer_meta : -# func : MatmulDoubleGradInferMeta -# kernel : -# func : matmul_double_grad # - backward_api : matmul_triple_grad # forward : matmul_double_grad (Tensor x, Tensor y, Tensor out_grad, Tensor dx_grad, Tensor dy_grad, bool transpose_x, bool transpose_y) -> Tensor(d2x), Tensor(d2y), Tensor(dout_grad) diff --git a/python/paddle/utils/code_gen/backward_api_gen.py b/python/paddle/utils/code_gen/backward_api_gen.py index bde5d4c90b907..2d33cd5b1812a 100644 --- a/python/paddle/utils/code_gen/backward_api_gen.py +++ b/python/paddle/utils/code_gen/backward_api_gen.py @@ -31,10 +31,10 @@ def get_api_name(self, api_item_yaml): def parse_forward_config(self, forward_config): # api_name (const Tensor& input, ... , int attr, ...) 
-> Tensor(out) result = re.search( - r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->[^\(]*\((?P[^\)]+)\)", + r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", forward_config) api = result.group('api') - outputs = [item.strip() for item in result.group('outputs').split(',')] + _, outputs, _ = self.parse_output(self.api, result.group('outputs')) fw_inputs, fw_attrs, _, = self.parse_input_and_attr( api, result.group('args')) @@ -47,7 +47,7 @@ def check_args(self, forward_config): # check the inputs of backward for input in self.inputs['names']: - if input not in fw_inputs and input not in fw_outputs: + if input not in fw_inputs['names'] and input not in fw_outputs: if input.endswith('_grad'): original_name = input[:-5] assert original_name in fw_outputs, \ @@ -132,6 +132,7 @@ def header_include(): #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" +#include "paddle/utils/optional.h" """ From 27536a322ba9d5127374e70e36da4d79166be5da Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 28 Feb 2022 10:44:46 +0800 Subject: [PATCH 78/85] infrt add trt engine (#39885) --- paddle/fluid/platform/dynload/tensorrt.h | 12 +- paddle/infrt/CMakeLists.txt | 1 + paddle/infrt/backends/CMakeLists.txt | 3 + paddle/infrt/backends/tensorrt/CMakeLists.txt | 3 + .../backends/tensorrt/test_trt_engine.cc | 254 ++++++++++++ paddle/infrt/backends/tensorrt/trt_engine.cc | 365 ++++++++++++++++++ paddle/infrt/backends/tensorrt/trt_engine.h | 114 ++++++ paddle/infrt/backends/tensorrt/trt_options.h | 94 +++++ paddle/infrt/backends/tensorrt/trt_utils.h | 147 +++++++ paddle/infrt/kernel/phi/CMakeLists.txt | 4 + .../infershaped/infershape_launchers_test.cc | 2 +- tools/infrt/get_phi_kernel_info.py | 2 +- 12 files changed, 993 insertions(+), 8 deletions(-) create mode 100644 paddle/infrt/backends/CMakeLists.txt create mode 100644 paddle/infrt/backends/tensorrt/CMakeLists.txt create mode 100644 paddle/infrt/backends/tensorrt/test_trt_engine.cc create mode 100644 paddle/infrt/backends/tensorrt/trt_engine.cc create mode 100644 paddle/infrt/backends/tensorrt/trt_engine.h create mode 100644 paddle/infrt/backends/tensorrt/trt_options.h create mode 100644 paddle/infrt/backends/tensorrt/trt_utils.h diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index bc29a0472041a..c2d7eef582369 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -37,7 +37,7 @@ void* GetTensorRtPluginHandle(); extern std::once_flag tensorrt_plugin_dso_flag; extern void* tensorrt_plugin_dso_handle; -#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_(__name) \ struct DynLoad__##__name { \ template \ void* operator()(Args... args) { \ @@ -55,7 +55,7 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ @@ -72,7 +72,7 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name -#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ @@ -109,10 +109,10 @@ extern void* tensorrt_plugin_dso_handle; #define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ __macro(initLibNvInferPlugins); -TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP) +TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP_) TENSORRT_RAND_ROUTINE_EACH_NON_POINTER( - DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP) -TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) + DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP_) +TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP_) #endif // end of NV_TENSORRT_MAJOR diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index f2a78db558ee2..dc22eecc99cdd 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -74,6 +74,7 @@ endif() add_subdirectory(api) +add_subdirectory(backends) add_subdirectory(common) add_subdirectory(dialect) add_subdirectory(host_context) diff --git a/paddle/infrt/backends/CMakeLists.txt b/paddle/infrt/backends/CMakeLists.txt new file mode 100644 index 0000000000000..b639f89292568 --- /dev/null +++ b/paddle/infrt/backends/CMakeLists.txt @@ -0,0 +1,3 @@ +if (INFRT_WITH_PHI AND WITH_GPU AND WITH_TENSORRT) + add_subdirectory(tensorrt) +endif() diff --git a/paddle/infrt/backends/tensorrt/CMakeLists.txt b/paddle/infrt/backends/tensorrt/CMakeLists.txt new file mode 100644 index 0000000000000..cc20c9a2e14b6 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_library(infrt_trt SRCS trt_engine.cc DEPS glog phi_dynload_cuda phi) + +cc_test_tiny(test_infrt_trt SRCS test_trt_engine.cc DEPS infrt_trt phi_dynload_cuda tensorrt_converter) diff --git a/paddle/infrt/backends/tensorrt/test_trt_engine.cc b/paddle/infrt/backends/tensorrt/test_trt_engine.cc new file mode 100644 index 0000000000000..54b7bc3e8af83 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/test_trt_engine.cc @@ -0,0 +1,254 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/infrt/backends/tensorrt/trt_engine.h" +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/allocator.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/meta_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +const char* model_input = "model_input"; +const char* model_output = "model_output1"; +const char* model_output2 = "model_output2"; + +TrtUniquePtr ConstructNetwork( + nvinfer1::IBuilder* builder, nvinfer1::Dims dims, bool is_static_shape) { + TrtUniquePtr network; + if (is_static_shape) { + network.reset(builder->createNetworkV2(0U)); + } else { + auto networkFlags = + 1U << static_cast( + nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH); + network.reset(builder->createNetworkV2(networkFlags)); + } + + ITensor* data = + network->addInput(model_input, nvinfer1::DataType::kFLOAT, dims); + CHECK_NOTNULL(data); + IActivationLayer* act = + network->addActivation(*data, ActivationType::kSIGMOID); + CHECK_NOTNULL(act); + auto* act_out = act->getOutput(0); + std::vector output_length{1, 2}; + int axis; + nvinfer1::IPluginV2Layer* split_layer; + if (is_static_shape) { + axis = 0; + paddle::inference::tensorrt::plugin::SplitPlugin plugin( + axis, output_length, false); + split_layer = network->addPluginV2(&act_out, 1, plugin); + } else { + axis = 1; + paddle::inference::tensorrt::plugin::SplitPluginDynamic plugin( + axis, output_length, false); + split_layer = network->addPluginV2(&act_out, 1, plugin); + } + + split_layer->getOutput(0)->setName(model_output); + split_layer->getOutput(1)->setName(model_output2); + network->markOutput(*split_layer->getOutput(0)); + network->markOutput(*split_layer->getOutput(1)); + return network; +} + +// sigmoid(x) = 1 / (1 + exp(-x)) +inline float sigmoid(float x) { return 1.f / (1.f + exp(-1 * x)); } + +TEST(trt, run_static) { + TRTEngine static_trt_engine(0); + auto net = ConstructNetwork( + static_trt_engine.GetTrtBuilder(), nvinfer1::Dims3{3, 28, 28}, true); + BuildOptions static_build_options; + static_build_options.max_batch = 4; + static_trt_engine.Build(std::move(net), static_build_options); + InferenceOptions inference_options; + inference_options.batch = 2; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 28, 28})); + phi::DenseTensor input; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 28 * 28, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + 
inputs.emplace(std::make_pair(model_input, &input)); + phi::DenseTensor output, output2; + std::unordered_map outputs; + outputs.emplace(std::make_pair(model_output, &output)); + outputs.emplace(std::make_pair(model_output2, &output2)); + + static_trt_engine.SetUpInference(inference_options, inputs, &outputs); + static_trt_engine.GetEngineInfo(); + static_trt_engine.Run(context); + + std::vector output_data1(inference_options.batch * 1 * 28 * 28, 0); + std::vector output_data2(inference_options.batch * 2 * 28 * 28, 0); + paddle::memory::Copy(phi::CPUPlace(), + output_data1.data(), + place, + output.data(), + sizeof(float) * output_data1.size(), + context.stream()); + paddle::memory::Copy(phi::CPUPlace(), + output_data2.data(), + place, + output2.data(), + sizeof(float) * output_data2.size(), + context.stream()); + cudaStreamSynchronize(context.stream()); + + for (size_t i = 0; i < host_data.size(); ++i) { + int w = i % 28; + int h = (i / 28) % 28; + int c = i / (28 * 28) % 3; + int n = i / (28 * 28 * 3); + if (c == 0) { + CHECK_NEAR( + sigmoid(host_data[i]), output_data1[n * 28 * 28 + h * 28 + w], 1e-5); + } else { + CHECK_NEAR(sigmoid(host_data[i]), + output_data2[n * 28 * 28 * 2 + (c - 1) * 28 * 28 + h * 28 + w], + 1e-5); + } + } +} + +TEST(trt, run_dynamic) { + TRTEngine engine(0); + auto net = ConstructNetwork( + engine.GetTrtBuilder(), nvinfer1::Dims4{-1, 3, -1, -1}, false); + BuildOptions build_options; + build_options.max_batch = 4; + build_options.workspace = 32; + // build_options.fp16 = true; + std::vector min_shape{1, 3, 16, 16}; + std::vector opt_shape{2, 3, 28, 28}; + std::vector max_shape{4, 3, 28, 28}; + build_options.shapes[model_input][0] = min_shape; + build_options.shapes[model_input][1] = opt_shape; + build_options.shapes[model_input][2] = max_shape; + engine.Build(std::move(net), build_options); + + InferenceOptions inference_options; + inference_options.batch = 2; + + phi::GPUPlace place; + phi::GPUContext context; + context.PartialInitWithoutAllocator(); + context.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(place, context.stream()) + .get()); + context.PartialInitWithAllocator(); + + phi::DenseTensorMeta meta( + phi::DataType::FLOAT32, + phi::make_ddim({inference_options.batch, 3, 16, 16})); + phi::DenseTensor input, output, output2; + input.set_meta(meta); + context.Alloc(&input, input.numel() * sizeof(float)); + std::vector host_data(inference_options.batch * 3 * 16 * 16, 0); + for (size_t i = 0; i < host_data.size(); ++i) { + host_data[i] = i % 100 * 0.016f; + } + paddle::memory::Copy(place, + input.data(), + phi::CPUPlace(), + host_data.data(), + sizeof(float) * host_data.size(), + context.stream()); + + std::unordered_map inputs; + std::unordered_map outputs; + inputs.emplace(std::make_pair(model_input, &input)); + outputs.emplace(std::make_pair(model_output, &output)); + outputs.emplace(std::make_pair(model_output2, &output2)); + + engine.SetUpInference(inference_options, inputs, &outputs); + engine.GetEngineInfo(); + engine.Run(context); + + std::vector output_data1(inference_options.batch * 1 * 16 * 16, 0); + std::vector output_data2(inference_options.batch * 2 * 16 * 16, 0); + paddle::memory::Copy(phi::CPUPlace(), + output_data1.data(), + place, + output.data(), + sizeof(float) * output_data1.size(), + context.stream()); + paddle::memory::Copy(phi::CPUPlace(), + output_data2.data(), + place, + output2.data(), + sizeof(float) * output_data2.size(), + context.stream()); + cudaStreamSynchronize(context.stream()); + + for 
(size_t i = 0; i < host_data.size(); ++i) { + int w = i % 16; + int h = (i / 16) % 16; + int c = i / (16 * 16) % 3; + int n = i / (16 * 16 * 3); + if (c == 0) { + CHECK_NEAR( + sigmoid(host_data[i]), output_data1[n * 16 * 16 + h * 16 + w], 1e-5); + } else { + CHECK_NEAR(sigmoid(host_data[i]), + output_data2[n * 16 * 16 * 2 + (c - 1) * 16 * 16 + h * 16 + w], + 1e-5); + } + } +} + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_engine.cc b/paddle/infrt/backends/tensorrt/trt_engine.cc new file mode 100644 index 0000000000000..a204fe42b4508 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_engine.cc @@ -0,0 +1,365 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/infrt/backends/tensorrt/trt_engine.h" + +#include +#include +#include "glog/logging.h" +#include "paddle/phi/backends/dynload/tensorrt.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/ddim.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +// The following two API are implemented in TensorRT's header file, cannot load +// from the dynamic library. So create our own implementation and directly +// trigger the method from the dynamic library. 
+static nvinfer1::IBuilder* createInferBuilder( + nvinfer1::ILogger& logger) { // NOLINT + return static_cast( + phi::dynload::createInferBuilder_INTERNAL(&logger, NV_TENSORRT_VERSION)); +} +static nvinfer1::IRuntime* createInferRuntime( + nvinfer1::ILogger& logger) { // NOLINT + return static_cast( + phi::dynload::createInferRuntime_INTERNAL(&logger, NV_TENSORRT_VERSION)); +} + +TRTEngine::TRTEngine(int device_id) : device_id_(device_id) { + FreshDeviceId(); + logger_.reset(new TrtLogger()); + builder_.reset(createInferBuilder(logger_->GetTrtLogger())); + phi::dynload::initLibNvInferPlugins(&logger_->GetTrtLogger(), ""); +} + +nvinfer1::IBuilder* TRTEngine::GetTrtBuilder() { + CHECK_NOTNULL(builder_); + return builder_.get(); +} + +void TRTEngine::Build(TrtUniquePtr network, + const BuildOptions& build_options) { + FreshDeviceId(); + ModelToBuildEnv(std::move(network), build_options); + CHECK_NOTNULL(engine_); +} + +bool TRTEngine::ModelToBuildEnv( + TrtUniquePtr network, + const BuildOptions& build) { + CHECK_NOTNULL(builder_); + std::swap(network, network_); + CHECK_NOTNULL(network_); + // ModelToNetwork(network_, logger); + NetworkToEngine(build); + return true; +} + +bool TRTEngine::NetworkToEngine(const BuildOptions& build) { + TrtUniquePtr config{builder_->createBuilderConfig()}; + CHECK_NOTNULL(config); + CHECK(SetupNetworkAndConfig(build, *network_, *config)); + +#if IS_TRT_VERSION_LT(8000) + engine_.reset(builder_->buildEngineWithConfig(*network_, *config)); +#else + serialized_engine_.reset( + builder_->buildSerializedNetwork(*network_, *config)); + CHECK_NOTNULL(serialized_engine_); + + TrtUniquePtr runtime{createInferRuntime(logger_->GetTrtLogger())}; + CHECK_NOTNULL(runtime); + engine_.reset(runtime->deserializeCudaEngine(serialized_engine_->data(), + serialized_engine_->size())); + CHECK_NOTNULL(engine_); +#endif + return true; +} + +bool TRTEngine::SetupNetworkAndConfig(const BuildOptions& build, + INetworkDefinition& network, + IBuilderConfig& config) { + builder_->setMaxBatchSize(build.max_batch); + // TODO(wilber): handle one engine - multi execution context case. + IOptimizationProfile* profile{nullptr}; + if (!build.shapes.empty()) { + profile = builder_->createOptimizationProfile(); + CHECK_NOTNULL(profile); + } + + // Set formats and data types of inputs + for (int32_t i = 0; i < network.getNbInputs(); ++i) { + auto* input = network.getInput(i); + if (!build.input_formats.empty()) { + input->setType(build.input_formats[i].first); + input->setAllowedFormats(build.input_formats[i].second); + } else { + switch (input->getType()) { + case DataType::kINT32: + case DataType::kBOOL: + case DataType::kHALF: + // Leave these as is. + break; + case DataType::kFLOAT: + case DataType::kINT8: + // User did not specify a floating-point format. Default to kFLOAT. + input->setType(DataType::kFLOAT); + break; + } + input->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + + if (profile) { + Dims dims = input->getDimensions(); + // TODO(wilber): shape tensor. + const bool is_dynamic_input = std::any_of( + dims.d, dims.d + dims.nbDims, [](int dim) { return dim == -1; }); + if (is_dynamic_input) { + is_dynamic_shape_ = true; + auto shape = build.shapes.find(input->getName()); + + // If no shape is provided + if (shape == build.shapes.end()) { + // TODO(wilber): add infomation. 
+ CHECK(false); + } + LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; + std::vector profile_dims{}; + profile_dims = + shape->second[static_cast(OptProfileSelector::kMIN)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kMIN, + VecToDims(profile_dims))); + profile_dims = + shape->second[static_cast(OptProfileSelector::kOPT)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kOPT, + VecToDims(profile_dims))); + profile_dims = + shape->second[static_cast(OptProfileSelector::kMAX)]; + CHECK(profile->setDimensions(input->getName(), + OptProfileSelector::kMAX, + VecToDims(profile_dims))); + } + } + } + + if (profile && is_dynamic_shape_) { + CHECK(profile->isValid()); // Required optimization profile is invalid + CHECK_NE(config.addOptimizationProfile(profile), -1); + } + + // Set formats and data types of outputs + for (int32_t i = 0, n = network.getNbOutputs(); i < n; i++) { + auto* output = network.getOutput(i); + if (!build.output_formats.empty()) { + // int outputFormatIndex = broadcastOutputFormats ? 0 : i; + output->setType(build.output_formats[i].first); + output->setAllowedFormats(build.output_formats[i].second); + } else { + output->setAllowedFormats(1U << static_cast(TensorFormat::kLINEAR)); + } + } + + config.setMaxWorkspaceSize(static_cast(build.workspace) << 20); + + if (build.fp16) { + config.setFlag(BuilderFlag::kFP16); + bool support_fp16 = builder_->platformHasFastFp16(); + if (support_fp16) { + LOG(INFO) << "Run INFRT-TRT FP16 mode"; + } else { + LOG(INFO) << "You specify FP16 mode, but the hardware does not support " + "FP16 speed up, use FP32 instead."; + } + } + + if (build.tf32) { + config.setFlag(BuilderFlag::kTF32); + bool support_tf32 = builder_->platformHasTf32(); + if (support_tf32) { + LOG(INFO) << "Run INFRT-TRT TF32 mode"; + } else { + LOG(INFO) << "You specify TF32 mode, but the hardware does not support " + "TF32 speed up, use FP32 instead."; + } + } + + // TODO(wilber): other precision. + + // TODO(wilber): precision config. + switch (build.precision_constraints) { + case PrecisionConstraints::kNONE: + // It's the default for TensorRT. + break; + case PrecisionConstraints::kOBEY: + config.setFlag(BuilderFlag::kOBEY_PRECISION_CONSTRAINTS); + break; + case PrecisionConstraints::kPREFER: + config.setFlag(BuilderFlag::kPREFER_PRECISION_CONSTRAINTS); + break; + } + + // TODO(TRT): DLA config. + + // TODO(TRT): int8 config. + // TODO(TRT): support int8 + if (build.int8) { + assert(false); + config.setFlag(BuilderFlag::kINT8); + bool support_int8 = builder_->platformHasFastInt8(); + if (support_int8) { + LOG(INFO) << "Run INFRT-TRT INT8 mode"; + } + } + + // TODO(TRT): calib config. + + // TODO(TRT): sparse config. 
+ + return true; +} + +bool TRTEngine::SetUpInference( + const InferenceOptions& inference, + const std::unordered_map& inputs, + std::unordered_map* outputs) { + // TODO(wilber): now only create one exec_context + FreshDeviceId(); + CHECK(engine_ != nullptr); + nvinfer1::IExecutionContext* ec = engine_->createExecutionContext(); + CHECK(ec != nullptr); + contexts_.emplace_back(ec); + bindings_.emplace_back(new Bindings()); + + for (const auto& it : inputs) { + const int bind_index = engine_->getBindingIndex(it.first.c_str()); + bindings_.front()->AddBinding( + bind_index, it.first, true, it.second, nvinfer1::DataType::kFLOAT); + } + for (auto& it : *outputs) { + const int bind_index = engine_->getBindingIndex(it.first.c_str()); + bindings_.front()->AddBinding( + bind_index, it.first, false, it.second, nvinfer1::DataType::kFLOAT); + } + + return true; +} + +void TRTEngine::Run(const phi::GPUContext& ctx) { + if (is_dynamic_shape_) { + DynamicRun(ctx); + } else { + StaticRun(ctx); + } +} + +void TRTEngine::StaticRun(const phi::GPUContext& ctx) { + const int num_bindings = engine_->getNbBindings(); + std::vector buffers(num_bindings, nullptr); + + int runtime_batch = -1; + auto input_binds = bindings_.front()->GetInputBindings(); + for (auto bind : input_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + buffers[bind_index] = + const_cast(static_cast(bind.buffer->data())); + if (runtime_batch != -1) { + CHECK_EQ(runtime_batch, phi::vectorize(bind.buffer->dims())[0]); + } + runtime_batch = bind.buffer->dims()[0]; + } + + auto output_binds = bindings_.front()->GetOutputBindings(); + for (auto bind : output_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + std::vector ddim; + auto dims = engine_->getBindingDimensions(bind_index); + ddim.push_back(runtime_batch); + for (int i = 0; i < dims.nbDims; ++i) { + ddim.push_back(dims.d[i]); + } + bind.buffer->Resize(phi::make_ddim(ddim)); + ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); + buffers[bind_index] = static_cast(bind.buffer->data()); + } + + contexts_.front()->enqueue( + runtime_batch, buffers.data(), ctx.stream(), nullptr); +} + +void TRTEngine::DynamicRun(const phi::GPUContext& ctx) { + const int num_bindings = engine_->getNbBindings(); + std::vector buffers(num_bindings, nullptr); + + auto input_binds = bindings_.front()->GetInputBindings(); + for (auto bind : input_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + buffers[bind_index] = + const_cast(static_cast(bind.buffer->data())); + nvinfer1::Dims trt_dims; + trt_dims.nbDims = bind.buffer->dims().size(); + + for (int i = 0; i < trt_dims.nbDims; ++i) { + trt_dims.d[i] = bind.buffer->dims()[i]; + } + contexts_.front()->setBindingDimensions(bind_index, trt_dims); + } + + CHECK(contexts_.front()->allInputDimensionsSpecified()); + + auto output_binds = bindings_.front()->GetOutputBindings(); + for (auto bind : output_binds) { + const int bind_index = engine_->getBindingIndex(bind.name.c_str()); + auto dims = contexts_.front()->getBindingDimensions(bind_index); + std::vector ddim(dims.nbDims); + for (int i = 0; i < dims.nbDims; ++i) { + ddim[i] = dims.d[i]; + } + bind.buffer->Resize(phi::make_ddim(ddim)); + ctx.Alloc(bind.buffer, sizeof(float) * bind.buffer->numel()); + buffers[bind_index] = static_cast(bind.buffer->data()); + } + + contexts_.front()->enqueueV2(buffers.data(), ctx.stream(), nullptr); +} + +void TRTEngine::FreshDeviceId() { + int count; + cudaGetDeviceCount(&count); + 
CHECK_LT(device_id_, count); + phi::backends::gpu::SetDeviceId(device_id_); +} + +void TRTEngine::GetEngineInfo() { +#if IS_TRT_VERSION_GE(8200) + LOG(INFO) << "====== engine info ======"; + std::unique_ptr infer_inspector( + engine_->createEngineInspector()); + infer_inspector->setExecutionContext(contexts_.front().get()); + LOG(INFO) << infer_inspector->getEngineInformation( + nvinfer1::LayerInformationFormat::kONELINE); + LOG(INFO) << "====== engine info end ======"; +#else + LOG(INFO) << "Inspector needs TensorRT version 8.2 or later."; +#endif +} + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_engine.h b/paddle/infrt/backends/tensorrt/trt_engine.h new file mode 100644 index 0000000000000..f72bdaf3ac0b4 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_engine.h @@ -0,0 +1,114 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/infrt/backends/tensorrt/trt_options.h" +#include "paddle/infrt/backends/tensorrt/trt_utils.h" +#include "paddle/phi/backends/dynload/tensorrt.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { +using namespace nvinfer1; // NOLINT + +// The TRT programming model is as follows: +// 1. The build phase: +// IBuilder* builder = createInferBuilder(&logger_); +// 2. Create a network definition: +// INetworkDefinition* network = builder->createNetworkV2(...); +// 3. Build network: +// network->AddLayer(...) +// 4. Configure network: +// IBuilderConfig* config = builder->createBuilderConfig(); +// config->setMaxWorkspaceSize(...) +// 5. Get the CUDA engine by deserializing the plan: +// IHostMemory* serialized_model = builder->buildSerializedNetwork(...); +// IRuntime* runtime = createInferRuntime(&logger_); +// ICudaEngine* engine = runtime->deserializeCudaEngine(...); +// 6. Get execution context: +// IExecutionContext* exec_context = engine->createExecutionContext(); +// 7. Set input data: +// int32_t input_index = engine->getBindingIndex("input"); +// int32_t output_index = engine->getBindingIndex("output"); +// void* buffers[2]; +// buffers[input_index] = input_buffer; +// buffers[output_index] = output_buffer; +// 8. Perform inference: +// exec_context->enqueueV2(buffers, stream, nullptr); +// +// We have encapsulated this logic; please use the following programming model. +// +// TRTEngine trt_engine; +// trt_engine.Build(...); +// trt_engine.SetUpInference(...); +// trt_engine.Run(...); +class TRTEngine { + public: + explicit TRTEngine(int device_id); + + nvinfer1::IBuilder* GetTrtBuilder(); + + // TODO(wilber): Modify signature after infrt-trt ready. 
+ void Build(TrtUniquePtr network, + const BuildOptions& build_options); + + // TODO(wilber): Modify signature after infrt-trt ready. + void Run(const phi::GPUContext& ctx); + + // TODO(wilber): How to support multiple execution contexts? + bool SetUpInference( + const InferenceOptions& inference, + const std::unordered_map& inputs, + std::unordered_map* outputs); + + void GetEngineInfo(); + + private: + void FreshDeviceId(); + + bool SetupNetworkAndConfig(const BuildOptions& build, + INetworkDefinition& network, // NOLINT + IBuilderConfig& config); // NOLINT + + bool NetworkToEngine(const BuildOptions& build); + + bool ModelToBuildEnv(TrtUniquePtr network, + const BuildOptions& build); + + void StaticRun(const phi::GPUContext& ctx); + + void DynamicRun(const phi::GPUContext& ctx); + + private: + std::unique_ptr logger_{nullptr}; + TrtUniquePtr builder_{nullptr}; + TrtUniquePtr network_{nullptr}; + std::unique_ptr serialized_engine_{nullptr}; + TrtUniquePtr engine_{nullptr}; + std::vector> contexts_; + std::vector> bindings_; + int device_id_{0}; + bool is_dynamic_shape_{false}; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_options.h b/paddle/infrt/backends/tensorrt/trt_options.h new file mode 100644 index 0000000000000..d5190f5e6220e --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_options.h @@ -0,0 +1,94 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include + +namespace infrt { +namespace backends { +namespace tensorrt { + +// Build default params +constexpr int32_t max_batch_not_provided{0}; +constexpr int32_t default_workspace{16}; +// Inference default params +constexpr int32_t default_batch{1}; +constexpr int32_t batch_not_provided{0}; + +enum class PrecisionConstraints { kNONE, kOBEY, kPREFER }; + +enum class SparsityFlag { kDISABLE, kENABLE, kFORCE }; + +using ShapeRange = + std::array, + nvinfer1::EnumMax()>; + +using IOFormat = std::pair; + +struct BuildOptions { + // Set max batch size. + int32_t max_batch{max_batch_not_provided}; + + // Set workspace size in megabytes (default = 16) + int32_t workspace{default_workspace}; + + // Enable tf32 precision, in addition to fp32 (default = disabled) + bool tf32{false}; + + // Enable fp16 precision, in addition to fp32 (default = disabled) + bool fp16{false}; + + // Enable int8 precision, in addition to fp32 (default = disabled) + bool int8{false}; + + // Control precision constraints. (default = none) + // Precision Constaints: = none, obey, prefer + // none = no constraints + // prefer = meet precision constraints if possible + // obey = meet precision constraints or fail otherwise + PrecisionConstraints precision_constraints{PrecisionConstraints::kNONE}; + + // Save the serialized engine. + bool save{false}; + + // Load a serialized engine. 
+ bool load{false}; + + // Build with dynamic shapes using a profile with the min, max and opt shapes + // provided + std::unordered_map shapes; + + // Type and format of each of the input tensors (default = all inputs in + // fp32:chw) + std::vector input_formats; + + // Type and format of each of the output tensors (default = all outputs in + // fp32:chw) + std::vector output_formats; +}; + +struct InferenceOptions { + int32_t batch{batch_not_provided}; + std::unordered_map> shapes; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/backends/tensorrt/trt_utils.h b/paddle/infrt/backends/tensorrt/trt_utils.h new file mode 100644 index 0000000000000..4b129af1d5381 --- /dev/null +++ b/paddle/infrt/backends/tensorrt/trt_utils.h @@ -0,0 +1,147 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include "glog/logging.h" +#include "paddle/phi/core/dense_tensor.h" + +namespace infrt { +namespace backends { +namespace tensorrt { + +#define IS_TRT_VERSION_GE(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) >= version) + +#define IS_TRT_VERSION_LT(version) \ + ((NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD) < version) + +#define TRT_VERSION \ + NV_TENSORRT_MAJOR * 1000 + NV_TENSORRT_MINOR * 100 + \ + NV_TENSORRT_PATCH * 10 + NV_TENSORRT_BUILD + +inline nvinfer1::Dims VecToDims(const std::vector& vec) { + int limit = static_cast(nvinfer1::Dims::MAX_DIMS); + if (static_cast(vec.size()) > limit) { + assert(false); + } + // Pick first nvinfer1::Dims::MAX_DIMS elements + nvinfer1::Dims dims{std::min(static_cast(vec.size()), limit), {}}; + std::copy_n(vec.begin(), dims.nbDims, std::begin(dims.d)); + return dims; +} + +template +struct TrtDestroyer { + void operator()(T* t) { t->destroy(); } +}; + +template +using TrtUniquePtr = std::unique_ptr>; + +class TrtLogger : public nvinfer1::ILogger { + public: + void log(nvinfer1::ILogger::Severity severity, + const char* msg) noexcept override { + switch (severity) { + case Severity::kVERBOSE: + VLOG(3) << msg; + break; + case Severity::kINFO: + VLOG(2) << msg; + break; + case Severity::kWARNING: + LOG(WARNING) << msg; + break; + case Severity::kINTERNAL_ERROR: + case Severity::kERROR: + LOG(ERROR) << msg; + break; + default: + break; + } + } + nvinfer1::ILogger& GetTrtLogger() noexcept { return *this; } + ~TrtLogger() override = default; +}; + +struct Binding { + bool is_input{false}; + nvinfer1::DataType data_type{nvinfer1::DataType::kFLOAT}; + phi::DenseTensor* buffer{nullptr}; + std::string name; +}; + +class Bindings { + public: + Bindings() = default; + + void AddBinding(int32_t b, + const std::string& name, + bool is_input, + phi::DenseTensor* 
buffer, + nvinfer1::DataType data_type) { + while (bindings_.size() <= static_cast(b)) { + bindings_.emplace_back(); + } + names_[name] = b; + bindings_[b].buffer = buffer; + bindings_[b].is_input = is_input; + bindings_[b].data_type = data_type; + bindings_[b].name = name; + } + + std::vector GetInputBindings() { + return GetBindings([](const Binding& b) -> bool { return b.is_input; }); + } + + std::vector GetOutputBindings() { + return GetBindings([](const Binding& b) -> bool { return !b.is_input; }); + } + + std::vector GetBindings() { + return GetBindings([](const Binding& b) -> bool { return true; }); + } + + std::vector GetBindings( + std::function predicate) { + std::vector bindings; + for (const auto& b : bindings_) { + if (predicate(b)) { + bindings.push_back(b); + } + } + return bindings; + } + + private: + std::unordered_map names_; + std::vector bindings_; +}; + +} // namespace tensorrt +} // namespace backends +} // namespace infrt diff --git a/paddle/infrt/kernel/phi/CMakeLists.txt b/paddle/infrt/kernel/phi/CMakeLists.txt index 30a2621f4abdf..7055c0c06d590 100644 --- a/paddle/infrt/kernel/phi/CMakeLists.txt +++ b/paddle/infrt/kernel/phi/CMakeLists.txt @@ -18,6 +18,10 @@ set(wrapped_infermeta_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/infermeta/gener add_custom_command( OUTPUT ${infrt_register_phi_kernels_gen_source_file} + COMMAND sh ${infrt_register_phi_kernels_gen_file} + DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} + VERBATIM) +add_custom_target(infrt_register_phi_kernel COMMAND sh ${infrt_register_phi_kernels_gen_file} DEPENDS ${wrapped_infermeta_header_file} ${wrapped_infermeta_source_file} COMMENT "infrt generate ${infrt_register_phi_kernels_gen_source_file}" diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 331ebcfb4a5d2..2161e98fac833 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -54,7 +54,7 @@ TEST(ElementwiseAdd, launcher_registry) { host_context::KernelRegistry registry; RegisterInferShapeLaunchers(®istry); ASSERT_GE(registry.size(), 1UL); - auto creator = registry.GetKernel("add.cpu.any.fp32"); + auto creator = registry.GetKernel("pten.add.cpu.any.fp32"); const phi::DDim dims({1, 2}); const phi::DataType dtype{phi::DataType::FLOAT32}; diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index b0c834718b1b3..f3e9f345da27b 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -219,7 +219,7 @@ def gen_register_info(resources: List[List[str]]): for ir_dtype, origin_dtype in zip(ir_dtypes, origin_dtypes): kernel_func = gen_kernel_func(update_item[3], ctx_name, origin_dtype) - ir_name = '.'.join( + ir_name = 'pten.' + '.'.join( [it.lower() for it in update_item[:3]]) + "." 
+ ir_dtype res += f""" registry->AddKernel("{ir_name}",""" From d1595c26d1d09f125de568626a1dcfeb5c08d6da Mon Sep 17 00:00:00 2001 From: zyfncg Date: Mon, 28 Feb 2022 10:45:03 +0800 Subject: [PATCH 79/85] [PHI] adjust the empty kernel and dev_api (#39958) * remove empty kernel in fluid and adjust the param of empty dev_api * polish code * revert fluid empty kernel --- paddle/phi/kernels/cpu/full_kernel.cc | 2 +- paddle/phi/kernels/empty_kernel.cc | 8 +- paddle/phi/kernels/empty_kernel.h | 10 +-- paddle/phi/kernels/full_kernel.h | 8 +- paddle/phi/kernels/impl/full_kernel_impl.h | 73 ------------------- .../tests/kernels/test_creation_dev_api.cc | 4 +- 6 files changed, 17 insertions(+), 88 deletions(-) delete mode 100644 paddle/phi/kernels/impl/full_kernel_impl.h diff --git a/paddle/phi/kernels/cpu/full_kernel.cc b/paddle/phi/kernels/cpu/full_kernel.cc index 6b0183d31c6ec..86576a861aa48 100644 --- a/paddle/phi/kernels/cpu/full_kernel.cc +++ b/paddle/phi/kernels/cpu/full_kernel.cc @@ -35,7 +35,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); FullValue(dev_ctx, out, val.to()); } diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index a902bd605542c..6e5f15fe1692b 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -69,7 +69,9 @@ PD_REGISTER_KERNEL(empty_like, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(empty, @@ -101,5 +103,7 @@ PD_REGISTER_KERNEL(empty_like, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); +} #endif diff --git a/paddle/phi/kernels/empty_kernel.h b/paddle/phi/kernels/empty_kernel.h index 54ba8b16c1d74..0b8d95ee94fb5 100644 --- a/paddle/phi/kernels/empty_kernel.h +++ b/paddle/phi/kernels/empty_kernel.h @@ -54,22 +54,20 @@ DenseTensor Empty(const Context& dev_ctx) { } template -DenseTensor Empty(const Context& dev_ctx, - const ScalarArray& shape, - DataType dtype = DataType::FLOAT32) { +DenseTensor Empty(const Context& dev_ctx, const ScalarArray& shape) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateInferMeta(shape, dtype, &meta_out); EmptyKernel(dev_ctx, shape, dtype, &dense_out); return dense_out; } template -DenseTensor EmptyLike(const Context& dev_ctx, - const DenseTensor& x, - DataType dtype = DataType::UNDEFINED) { +DenseTensor EmptyLike(const Context& dev_ctx, const DenseTensor& x) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateLikeInferMeta(x, dtype, &meta_out); EmptyLikeKernel(dev_ctx, x, dtype, &dense_out); return dense_out; diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h index 394aab8f96e1a..c7b1f9af0e319 100644 --- a/paddle/phi/kernels/full_kernel.h +++ b/paddle/phi/kernels/full_kernel.h @@ -48,10 +48,10 @@ void FullLikeKernel(const Context& dev_ctx, template DenseTensor Full(const Context& dev_ctx, const ScalarArray& shape, - const Scalar& val, - DataType dtype = DataType::FLOAT32) { + 
const Scalar& val) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateInferMeta(shape, dtype, &meta_out); FullKernel(dev_ctx, shape, val, dtype, &dense_out); return dense_out; @@ -60,10 +60,10 @@ DenseTensor Full(const Context& dev_ctx, template DenseTensor FullLike(const Context& dev_ctx, const DenseTensor& x, - const Scalar& val, - DataType dtype = DataType::UNDEFINED) { + const Scalar& val) { auto dense_out = Empty(dev_ctx); MetaTensor meta_out(&dense_out); + DataType dtype = paddle::experimental::CppTypeToDataType::Type(); CreateLikeInferMeta(x, dtype, &meta_out); FullLikeKernel(dev_ctx, x, val, dtype, &dense_out); return dense_out; diff --git a/paddle/phi/kernels/impl/full_kernel_impl.h b/paddle/phi/kernels/impl/full_kernel_impl.h deleted file mode 100644 index 8cced49906ecc..0000000000000 --- a/paddle/phi/kernels/impl/full_kernel_impl.h +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/phi/common/scalar.h" -#include "paddle/phi/common/scalar_array.h" -#include "paddle/phi/core/dense_tensor.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace phi { - -template -void FullValue(const Context& dev_ctx, DenseTensor* tensor, VType val) { - dev_ctx.template Alloc(tensor); - auto t = phi::EigenVector::Flatten(*tensor); - t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(val)); -} - -template -void FullKernel(const Context& dev_ctx, - const ScalarArray& shape, - const Scalar& val, - DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); - FullValue(dev_ctx, out, val.to()); -} - -template -void FullLikeKernel(const Context& dev_ctx, - const Scalar& val, - DenseTensor* out) { - auto value = val.to(); - using CommonType = typename std::common_type< - float, - typename std::conditional::value, - float, - T>::type>::type; - - auto common_type_value = static_cast(value); - - PADDLE_ENFORCE_EQ( - (common_type_value >= - static_cast(std::numeric_limits::lowest())) && - (common_type_value <= - static_cast(std::numeric_limits::max())), - true, - phi::errors::InvalidArgument( - "The filled value is out of range for target type, " - "current kernel type is %s, the range should between %f " - "and %f, but now value is %f.", - typeid(T).name(), - static_cast(std::numeric_limits::lowest()), - static_cast(std::numeric_limits::max()), - static_cast(value))); - FullValue(dev_ctx, out, value); -} - -} // namespace phi diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc index e4f80a5bd19eb..8c2c8642ab900 100644 --- a/paddle/phi/tests/kernels/test_creation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc @@ -39,7 +39,7 @@ TEST(DEV_API, empty) { dev_ctx.Init(); // 2. 
test API - auto out = phi::Empty(dev_ctx, {3, 2}, phi::DataType::INT32); + auto out = phi::Empty(dev_ctx, {3, 2}); // 3. check result ASSERT_EQ(out.dims().size(), 2); @@ -87,7 +87,7 @@ TEST(DEV_API, full) { .GetAllocator(paddle::platform::CPUPlace()) .get()); dev_ctx.Init(); - auto out = phi::Full(dev_ctx, {3, 2}, val, phi::DataType::FLOAT32); + auto out = phi::Full(dev_ctx, {3, 2}, val); // 3. check result ASSERT_EQ(out.dims().size(), 2); From 18ee051eb98afcfa433615f24beb046e6dea95f6 Mon Sep 17 00:00:00 2001 From: zhangbo9674 <82555433+zhangbo9674@users.noreply.github.com> Date: Mon, 28 Feb 2022 10:46:54 +0800 Subject: [PATCH 80/85] [bf16] Refine BF16 amp-o1 logic (#39815) * refine bf16 amp-o1 logic * refine amp GLOG * refine unittest * refine unittest --- paddle/fluid/imperative/amp_auto_cast.cc | 30 ++++++++++++++++--- paddle/fluid/imperative/tracer.cc | 6 ++-- python/paddle/fluid/dygraph/amp/auto_cast.py | 2 +- .../test_imperative_auto_mixed_precision.py | 19 ++++++++---- 4 files changed, 45 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 94c6d0a4d569a..6e8bfbb4a7761 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -273,8 +273,9 @@ static inline std::shared_ptr CastToBF16( template static inline framework::proto::VarType::Type GetPromoteType( - const std::string& op_type, const NameVarMap& ins) { - auto dst_type = framework::proto::VarType::FP16; + const std::string& op_type, const NameVarMap& ins, + const framework::proto::VarType::Type amp_dtype) { + auto dst_type = amp_dtype; for (const auto& pair : ins) { for (const auto& var : pair.second) { if (GetDataType(var) == framework::proto::VarType::FP32) { @@ -337,7 +338,8 @@ NameVarMap AutoCastInputs(const std::string& op_type, } return new_ins; } else { - auto dst_type = GetPromoteType(op_type, ins); + auto dst_type = + GetPromoteType(op_type, ins, framework::proto::VarType::FP16); // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. if (dst_type == framework::proto::VarType::FP16 && @@ -435,7 +437,7 @@ NameVarMap AutoCastBF16Inputs(const std::string& op_type, } } return new_ins; - } else { + } else if (AmpOperators::Instance().GetMutableBlockOps()->count(op_type)) { for (auto& pair : new_ins) { VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " << GetDtypeStr(*pair.second.cbegin()) << " to float"; @@ -444,6 +446,26 @@ NameVarMap AutoCastBF16Inputs(const std::string& op_type, } } return new_ins; + } else { + auto dst_type = + GetPromoteType(op_type, ins, framework::proto::VarType::BF16); + // NOTE(zhangbo): if the op has op fp16 kernel, fall back to fp32. + if (dst_type == framework::proto::VarType::BF16 && + AmpOperators::Instance().GetMutableUnsupportedBf16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } + for (auto& pair : new_ins) { + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " + << GetDtypeStr(*pair.second.cbegin()) << " to " + << framework::DataTypeToString(dst_type); + for (auto& var : pair.second) { + var = (dst_type == framework::proto::VarType::FP32 + ? 
CastToFP32(var) + : CastToBF16(var)); + } + } + return new_ins; } return new_ins; } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 03811ac778779..c832787d98906 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -205,17 +205,19 @@ void Tracer::TraceOp(const std::string& type, const NameVarMap& ins, NameVarMap new_ins = ins; if (amp_level_ == AmpLevel::O1) { - VLOG(5) << "Auto mixed precision run operator: " << type; if (amp_dtype_ == phi::DataType::FLOAT16) { + VLOG(5) << "Float16 Auto Mixed Precision O1 run operator: " << type; new_ins = AutoCastInputs(type, ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { + VLOG(5) << "BFloat16 Auto Mixed Precision O1 run operator: " << type; new_ins = AutoCastBF16Inputs(type, ins); } } else if (amp_level_ == AmpLevel::O2) { - VLOG(5) << "Pure fp16 run operator: " << type; if (amp_dtype_ == phi::DataType::FLOAT16) { + VLOG(5) << "Float16 Auto Mixed Precision O2 run operator: " << type; new_ins = CastPureFp16Inputs(type, ins); } else if (amp_dtype_ == phi::DataType::BFLOAT16) { + VLOG(5) << "BFloat16 Auto Mixed Precision O2 run operator: " << type; new_ins = CastPureBf16Inputs(type, ins); } } diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index 8230e4bbd7774..f43a51063b00a 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -75,7 +75,7 @@ 'lookup_table', 'lookup_table_v2', 'scatter', 'scatter_grad' } -BF16_WHITE_LIST = {'conv2d'} +BF16_WHITE_LIST = {'conv2d', 'matmul_v2'} BF16_BLACK_LIST = {' '} _g_amp_state_ = None diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 67c4bb3b2c746..5cb72512f99af 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -1131,20 +1131,29 @@ class TestBf16(unittest.TestCase): test amp for BF16 ''' - def train(self, enable_amp=True): + def train(self, enable_amp=True, amp_level='O1'): paddle.seed(100) input = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) 
conv = paddle.nn.Conv2D(4, 6, (3, 3)) with paddle.amp.auto_cast( - enable=enable_amp, level='O2', dtype='bfloat16'): + enable=enable_amp, level=amp_level, dtype='bfloat16'): output = conv(input) output = output.cast('float32') return output.numpy() def test_bf16(self): - out_fp32 = self.train(enable_amp=False) - out_bf16 = self.train(enable_amp=True) - self.assertTrue(np.allclose(out_fp32, out_bf16, rtol=1.e-3, atol=1.e-1)) + if fluid.core.is_compiled_with_cuda(): + cudnn_version = paddle.device.get_cudnn_version() + if cudnn_version is not None and cudnn_version >= 8100: + out_fp32 = self.train(enable_amp=False) + out_bf16_O1 = self.train(enable_amp=True, amp_level='O1') + out_bf16_O2 = self.train(enable_amp=True, amp_level='O2') + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O1, rtol=1.e-3, atol=1.e-1)) + self.assertTrue( + np.allclose( + out_fp32, out_bf16_O2, rtol=1.e-3, atol=1.e-1)) class TestPyLayerWithAmp(unittest.TestCase): From 406f1b9650e072abf04ac7572b24d89399d98343 Mon Sep 17 00:00:00 2001 From: liutiexing <74819124+liutiexing@users.noreply.github.com> Date: Mon, 28 Feb 2022 11:25:55 +0800 Subject: [PATCH 81/85] Update host tracer (#39975) * add align for WorkQueue * add spinlock * merge develop * merge * Add EventsWaiter * Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. * update HostTracer * fix * update * update Co-authored-by: liutiexing --- .../details/fast_threaded_ssa_graph_executor.cc | 2 ++ paddle/fluid/platform/os_info.cc | 4 +--- paddle/fluid/platform/os_info.h | 3 ++- paddle/fluid/platform/profiler/host_tracer.cc | 11 ++++++++++- paddle/fluid/platform/profiler/host_tracer.h | 8 ++++---- .../fluid/platform/profiler/trace_event_collector.h | 11 +++++++++++ 6 files changed, 30 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 1cf69a1a3d652..1b2b24762894c 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -231,6 +231,8 @@ void FastThreadedSSAGraphExecutor::RunOpAsync( OpHandleBase *op, const std::shared_ptr> &complete_q) { ++remaining_; + platform::RecordEvent("WorkQueue::AddTask", + platform::TracerEventType::UserDefined, 10 /*level*/); this->pool_->enqueue([=] { std::deque op_queue; op_queue.push_front(op); diff --git a/paddle/fluid/platform/os_info.cc b/paddle/fluid/platform/os_info.cc index 58d37783d0597..36dd7891d5518 100644 --- a/paddle/fluid/platform/os_info.cc +++ b/paddle/fluid/platform/os_info.cc @@ -95,8 +95,6 @@ std::unordered_map GetAllThreadIds() { return res; } -static constexpr const char* kDefaultThreadName = "unset"; - std::string GetCurrentThreadName() { const auto& thread_name = internal::ThreadDataRegistry::GetInstance() @@ -112,7 +110,7 @@ std::unordered_map GetAllThreadNames() { bool SetCurrentThreadName(const std::string& name) { auto& instance = internal::ThreadDataRegistry::GetInstance(); const auto& cur_name = instance.GetCurrentThreadData(); - if (!cur_name.empty() || cur_name == kDefaultThreadName) { + if (!cur_name.empty() || name.empty() || name == kDefaultThreadName) { return false; } instance.SetCurrentThreadData(name); diff --git a/paddle/fluid/platform/os_info.h b/paddle/fluid/platform/os_info.h index 7f607aaec9763..ef894fd3dc281 100644 --- a/paddle/fluid/platform/os_info.h +++ b/paddle/fluid/platform/os_info.h @@ -57,7 +57,8 @@ ThreadId 
GetCurrentThreadId(); // create/destory when using it. std::unordered_map GetAllThreadIds(); -// Returns 'unset' if SetCurrentThreadName is never called. +static constexpr const char* kDefaultThreadName = "unset"; +// Returns kDefaultThreadName if SetCurrentThreadName is never called. std::string GetCurrentThreadName(); // Return the map from StdTid to ThreadName diff --git a/paddle/fluid/platform/profiler/host_tracer.cc b/paddle/fluid/platform/profiler/host_tracer.cc index 2172fe4d1e3d5..3f97113aecbf5 100644 --- a/paddle/fluid/platform/profiler/host_tracer.cc +++ b/paddle/fluid/platform/profiler/host_tracer.cc @@ -26,6 +26,9 @@ void ProcessHostEvents(const HostEventSection& host_events, TraceEventCollector* collector) { for (const auto& thr_sec : host_events.thr_sections) { uint64_t tid = thr_sec.thread_id; + if (thr_sec.thread_name != kDefaultThreadName) { + collector->AddThreadName(tid, thr_sec.thread_name); + } for (const auto& evt : thr_sec.events) { HostTraceEvent event; event.name = evt.name; @@ -41,12 +44,18 @@ void ProcessHostEvents(const HostEventSection& host_events, } // namespace +void HostTracer::PrepareTracing() { + // warm up + HostTraceLevel::GetInstance().SetLevel(options_.trace_level); + state_ = TracerState::READY; +} + void HostTracer::StartTracing() { PADDLE_ENFORCE_EQ( state_ == TracerState::READY || state_ == TracerState::STOPED, true, platform::errors::PreconditionNotMet("TracerState must be READY")); HostEventRecorder::GetInstance().GatherEvents(); - HostTraceLevel::GetInstance().SetLevel(trace_level_); + HostTraceLevel::GetInstance().SetLevel(options_.trace_level); state_ = TracerState::STARTED; } diff --git a/paddle/fluid/platform/profiler/host_tracer.h b/paddle/fluid/platform/profiler/host_tracer.h index b6c10e558b787..d05e829357f88 100644 --- a/paddle/fluid/platform/profiler/host_tracer.h +++ b/paddle/fluid/platform/profiler/host_tracer.h @@ -45,9 +45,9 @@ struct HostTracerOptions { class HostTracer : public TracerBase { public: - explicit HostTracer(const HostTracerOptions& options) { - trace_level_ = options.trace_level; - } + explicit HostTracer(const HostTracerOptions& options) : options_(options) {} + + void PrepareTracing() override; void StartTracing() override; @@ -56,7 +56,7 @@ class HostTracer : public TracerBase { void CollectTraceData(TraceEventCollector* collector) override; private: - uint32_t trace_level_; + HostTracerOptions options_; }; } // namespace platform diff --git a/paddle/fluid/platform/profiler/trace_event_collector.h b/paddle/fluid/platform/profiler/trace_event_collector.h index 30b32220d9f84..cc85a178d14e5 100644 --- a/paddle/fluid/platform/profiler/trace_event_collector.h +++ b/paddle/fluid/platform/profiler/trace_event_collector.h @@ -15,6 +15,8 @@ limitations under the License. 
*/ #pragma once #include +#include +#include #include "paddle/fluid/platform/profiler/trace_event.h" namespace paddle { @@ -32,6 +34,10 @@ class TraceEventCollector { device_events_.push_back(event); } + void AddThreadName(uint64_t tid, const std::string& name) { + thread_names_[tid] = name; + } + const std::list& HostEvents() const { return host_events_; } const std::list& RuntimeEvents() const { @@ -42,7 +48,12 @@ class TraceEventCollector { return device_events_; } + const std::unordered_map& ThreadNames() const { + return thread_names_; + } + private: + std::unordered_map thread_names_; std::list host_events_; std::list runtime_events_; std::list device_events_; From eb42dd52e40dc35016b0e2de614896ed7f982640 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 28 Feb 2022 12:45:03 +0800 Subject: [PATCH 82/85] [Pten->Phi PR4] Rename pten in funcs to phi (#39961) * rename pten_utils to phi_utils * rename pten_utils target * rename Pten to Phi * replace pten with phi * resolve conflict --- .../distributed/ps/service/brpc_utils.cc | 4 +- .../eager_generated/backwards/scale_node.cc | 20 +-- paddle/fluid/eager/api/utils/tensor_utils.cc | 4 +- .../auto_code_generator/eager_generator.cc | 4 +- paddle/fluid/eager/eager_tensor.h | 6 +- paddle/fluid/eager/utils.cc | 2 +- paddle/fluid/framework/CMakeLists.txt | 6 +- paddle/fluid/framework/async_executor.cc | 2 +- paddle/fluid/framework/convert_utils.cc | 2 +- paddle/fluid/framework/convert_utils.h | 2 +- paddle/fluid/framework/convert_utils_test.cc | 22 ++-- paddle/fluid/framework/custom_operator.cc | 6 +- .../framework/data_device_transform_test.cu | 2 +- paddle/fluid/framework/data_type_test.cc | 4 +- .../fluid/framework/executor_thread_worker.cc | 2 +- paddle/fluid/framework/fleet/heter_wrapper.cc | 4 +- .../fluid/framework/heter_section_worker.cc | 2 +- paddle/fluid/framework/infershape_utils.cc | 12 +- .../fuse_optimizer_op_pass.cc | 6 +- .../conv_bias_mkldnn_fuse_pass_tester.cc | 2 +- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 2 +- .../mkldnn/cpu_quantize_squash_pass_tester.cc | 2 +- paddle/fluid/framework/lod_tensor.cc | 2 +- .../framework/new_executor/interpretercore.cc | 10 +- .../new_executor/interpretercore_util.cc | 28 ++--- .../new_executor/new_executor_defs.cc | 4 +- .../new_executor/new_executor_defs.h | 4 +- paddle/fluid/framework/operator.cc | 64 +++++----- paddle/fluid/framework/operator.h | 52 ++++---- .../cinn_graph_symbolization_test.cc | 2 +- .../framework/{pten_utils.cc => phi_utils.cc} | 45 ++++--- .../framework/{pten_utils.h => phi_utils.h} | 13 +- .../{pten_utils_test.cc => phi_utils_test.cc} | 20 +-- paddle/fluid/framework/tensor_util.cc | 2 +- paddle/fluid/imperative/amp_auto_cast.cc | 6 +- paddle/fluid/imperative/basic_engine.cc | 2 +- .../fluid/imperative/gradient_accumulator.cc | 8 +- .../fluid/imperative/partial_grad_engine.cc | 6 +- paddle/fluid/imperative/prepared_operator.cc | 28 ++--- paddle/fluid/imperative/prepared_operator.h | 30 ++--- paddle/fluid/imperative/reducer.cc | 4 +- paddle/fluid/imperative/tests/test_group.cc | 2 +- paddle/fluid/inference/io.cc | 2 +- paddle/fluid/inference/lite/tensor_utils.cc | 4 +- paddle/fluid/operators/benchmark/op_tester.cc | 2 +- paddle/fluid/operators/cast_op.cc | 2 +- paddle/fluid/operators/cast_op.h | 4 +- paddle/fluid/operators/cast_op_xpu.cc | 6 +- paddle/fluid/operators/cholesky_solve_op.h | 2 +- paddle/fluid/operators/coalesce_tensor_op.cc | 4 +- paddle/fluid/operators/conj_op.h | 2 +- paddle/fluid/operators/dot_op.h | 6 +- .../elementwise/elementwise_add_op.h | 2 
+- .../elementwise/elementwise_div_op.h | 8 +- .../elementwise/elementwise_mul_op.cu | 6 +- .../elementwise/elementwise_mul_op.h | 8 +- .../elementwise/elementwise_op_broadcast.cu.h | 6 +- .../elementwise/elementwise_op_impl.cu.h | 8 +- .../elementwise/elementwise_sub_op.h | 6 +- paddle/fluid/operators/empty_op.h | 2 +- .../fluid/operators/fill_any_like_op_npu.cc | 2 +- .../fluid/operators/fill_any_like_op_xpu.cc | 4 +- .../fill_constant_batch_size_like_op.h | 4 +- .../fill_constant_batch_size_like_op_npu.cc | 6 +- paddle/fluid/operators/fill_constant_op.h | 8 +- .../fluid/operators/fill_constant_op_npu.cc | 2 +- paddle/fluid/operators/fill_op.h | 6 +- paddle/fluid/operators/flatten_op.h | 10 +- .../fused_embedding_eltwise_layernorm_op.cu | 4 +- paddle/fluid/operators/lu_op.h | 14 +-- .../operators/metrics/accuracy_op_mlu.cc | 14 +-- paddle/fluid/operators/mlu/mlu_baseop.h | 2 +- paddle/fluid/operators/reduce_ops/reduce_op.h | 16 +-- paddle/fluid/operators/reshape_op.cc | 2 +- paddle/fluid/operators/scale_op_xpu.cc | 4 +- .../softmax_with_cross_entropy_op_mlu.cc | 2 +- paddle/fluid/operators/top_k_op_mlu.cc | 2 +- paddle/fluid/operators/top_k_v2_op_mlu.cc | 2 +- .../operators/uniform_random_inplace_op.cu | 2 +- .../fluid/platform/device/ipu/ipu_executor.cc | 2 +- .../platform/device/npu/npu_op_runner.cc | 4 +- paddle/fluid/platform/mkldnn_reuse.h | 4 +- paddle/fluid/platform/transform.h | 2 +- paddle/fluid/pybind/eager.cc | 6 +- paddle/fluid/pybind/eager_functions.cc | 2 +- paddle/fluid/pybind/eager_method.cc | 2 +- .../pybind/eager_op_function_generator.cc | 6 +- .../pybind/kernel_signature_generator.cc | 2 +- paddle/fluid/pybind/op_function_generator.cc | 6 +- paddle/fluid/pybind/pybind.cc | 116 +++++++++--------- paddle/fluid/pybind/tensor_py.h | 6 +- paddle/phi/api/all.h | 4 +- paddle/phi/api/include/tensor.h | 9 +- paddle/phi/api/lib/api_custom_impl.cc | 2 +- paddle/phi/api/lib/api_registry.h | 4 +- paddle/phi/api/lib/api_utils.h | 4 +- paddle/phi/api/lib/data_transform.cc | 6 +- paddle/phi/api/lib/kernel_dispatch.cc | 6 +- paddle/phi/api/lib/sparse_api.cc | 12 +- paddle/phi/api/lib/tensor.cc | 2 +- paddle/phi/api/lib/tensor_method.cc | 2 +- paddle/phi/api/lib/utils/tensor_utils.cc | 20 +-- paddle/phi/api/lib/utils/tensor_utils.h | 11 +- paddle/phi/backends/all_context.h | 2 +- paddle/phi/backends/cpu/cpu_context.h | 2 +- paddle/phi/backends/gpu/gpu_context.cc | 2 +- paddle/phi/backends/gpu/rocm/rocm_info.cc | 2 +- paddle/phi/backends/xpu/xpu_info.cc | 2 +- paddle/phi/common/layout.h | 2 +- paddle/phi/common/place.cc | 2 +- paddle/phi/core/compat/arg_map_context.h | 2 +- paddle/phi/core/compat/convert_utils.cc | 14 +-- paddle/phi/core/compat/convert_utils.h | 8 +- paddle/phi/core/custom_kernel.cc | 2 +- paddle/phi/core/dense_tensor_impl.cc | 2 +- paddle/phi/core/kernel_factory.h | 4 +- paddle/phi/core/utils/data_type.h | 44 +++---- paddle/phi/kernels/diagonal_kernel.h | 2 +- paddle/phi/kernels/digamma_grad_kernel.h | 2 +- paddle/phi/kernels/digamma_kernel.h | 2 +- paddle/phi/kernels/expand_kernel.h | 2 +- .../phi/kernels/masked_select_grad_kernel.h | 2 +- paddle/phi/kernels/masked_select_kernel.h | 2 +- paddle/phi/kernels/transfer_layout_kernel.cc | 2 +- paddle/phi/ops/compat/scale_sig.cc | 2 +- paddle/phi/tests/api/scale_api.h | 4 +- paddle/phi/tests/api/test_data_transform.cc | 2 +- paddle/phi/tests/api/test_pten_tensor.cc | 2 +- paddle/phi/tests/common/test_place.cc | 2 +- paddle/phi/tests/core/test_custom_kernel.cc | 8 +- .../fluid/tests/custom_op/custom_linear_op.cc | 8 +- 
.../test_get_all_registered_op_kernels.py | 10 +- python/setup.py.in | 12 +- 133 files changed, 510 insertions(+), 520 deletions(-) rename paddle/fluid/framework/{pten_utils.cc => phi_utils.cc} (85%) rename paddle/fluid/framework/{pten_utils.h => phi_utils.h} (87%) rename paddle/fluid/framework/{pten_utils_test.cc => phi_utils_test.cc} (84%) diff --git a/paddle/fluid/distributed/ps/service/brpc_utils.cc b/paddle/fluid/distributed/ps/service/brpc_utils.cc index 9f2a8eb24533d..2009ec772e1cf 100644 --- a/paddle/fluid/distributed/ps/service/brpc_utils.cc +++ b/paddle/fluid/distributed/ps/service/brpc_utils.cc @@ -238,7 +238,7 @@ void DeserializeLodTensor(framework::Variable* var, const VarMsg& msg, void* tensor_data = tensor->mutable_data( place, - framework::TransToPtenDataType(VarMessageToVarType(msg.data_type()))); + framework::TransToPhiDataType(VarMessageToVarType(msg.data_type()))); // IO Buffer if (platform::is_cpu_place(place)) { @@ -281,7 +281,7 @@ void DeserializeSelectedRows( tensor->Resize(phi::make_ddim(vec_dim)); void* tensor_data = tensor->mutable_data( place, - framework::TransToPtenDataType(VarMessageToVarType(msg.data_type()))); + framework::TransToPhiDataType(VarMessageToVarType(msg.data_type()))); // IO Buffer if (platform::is_cpu_place(place)) { unsigned long data_len; // NOLINT diff --git a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc index 3dbfba0d9150f..5a2595b9103e4 100644 --- a/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc +++ b/paddle/fluid/eager/api/generated/eager_generated/backwards/scale_node.cc @@ -33,36 +33,36 @@ static void ScaleDeviceDispatch(const phi::DenseTensor& dense_tensor, phi::DenseTensor* dense_out) { switch (dense_tensor.dtype()) { case phi::DataType::FLOAT64: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::FLOAT32: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::INT64: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); break; } case phi::DataType::INT32: { - phi::ScaleKernel::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), dense_tensor /* tensor */, scale /* scale */, bias /* bias */, bias_after_scale /* bias_after_scale */, dense_out /* out tensor */); diff --git a/paddle/fluid/eager/api/utils/tensor_utils.cc b/paddle/fluid/eager/api/utils/tensor_utils.cc index 628c0c500b3c4..77c39d1b0a37c 100644 --- a/paddle/fluid/eager/api/utils/tensor_utils.cc +++ b/paddle/fluid/eager/api/utils/tensor_utils.cc @@ -22,7 +22,7 @@ #include "paddle/phi/api/all.h" #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" namespace egr { @@ -43,7 +43,7 @@ paddle::experimental::Tensor CreateTensorWithValue( bool is_leaf) { paddle::experimental::Tensor out = paddle::experimental::full( phi::vectorize(ddim), paddle::experimental::Scalar(value), dtype, - phi::TransToPtenBackend(place)); + 
phi::TransToPhiBackend(place)); auto meta = EagerUtils::autograd_meta(&out); if (is_leaf) { diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 74c5bcdb20984..a8e0ed7a41a04 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" #define NUM_CREATED_DUP_INPUTS 4 @@ -544,7 +544,7 @@ static bool CheckOpProto(proto::OpProto* op_proto) { // since only OperatorWithKernel can run in dygraph mode. auto& all_kernels = paddle::framework::OperatorWithKernel::AllOpKernels(); if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { return false; } diff --git a/paddle/fluid/eager/eager_tensor.h b/paddle/fluid/eager/eager_tensor.h index 42a3a13e5f70a..41e57ef1a15b0 100644 --- a/paddle/fluid/eager/eager_tensor.h +++ b/paddle/fluid/eager/eager_tensor.h @@ -14,10 +14,10 @@ #pragma once // framework deps -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -// pten deps +// Phi deps #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/api/lib/api_declare.h" #include "paddle/phi/api/lib/utils/tensor_utils.h" @@ -31,7 +31,7 @@ * provide variable in * paddle::framework::ExecutionContext to support it. We should remove this as * soon as we finish our latest - * Pten Lib, and use paddle::experimental::Tensor instead. + * Phi Lib, and use paddle::experimental::Tensor instead. * * Note: Keep this class as clean as possible. 
* This class should only support method declared in diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 7464ad7413585..a7e5931f1f9bc 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -23,7 +23,7 @@ #include "paddle/phi/core/tensor_meta.h" #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/variable.h" PADDLE_DEFINE_EXPORTED_bool(retain_grad_for_all_tensor, true, diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 082c508174332..14aecb5fd43c4 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -193,9 +193,9 @@ cc_library(unused_var_check SRCS unused_var_check.cc DEPS glog no_need_buffer_va cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) IF(WITH_XPU) -cc_library(phi_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list) +cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info xpu_op_list) ELSE() -cc_library(phi_utils SRCS pten_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info) +cc_library(phi_utils SRCS phi_utils.cc DEPS lod_tensor selected_rows_utils place phi var_type_traits phi_api_utils op_info) ENDIF() IF(WITH_XPU) @@ -450,7 +450,7 @@ if(WITH_TESTING AND TEST selected_rows_utils_test) endif() cc_test(scope_guard_test SRCS scope_guard_test.cc) -cc_test(phi_utils_test SRCS pten_utils_test.cc DEPS phi_utils) +cc_test(phi_utils_test SRCS phi_utils_test.cc DEPS phi_utils) if(WITH_GPU OR WITH_ROCM) cc_library(fluid_convert_utils SRCS convert_utils.cc DEPS data_type place gpu_info) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index 81b6917587df9..ae3d8379bdbf7 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -33,7 +33,7 @@ limitations under the License. */ #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { diff --git a/paddle/fluid/framework/convert_utils.cc b/paddle/fluid/framework/convert_utils.cc index 23cf4324086bd..df5cc6d82042c 100644 --- a/paddle/fluid/framework/convert_utils.cc +++ b/paddle/fluid/framework/convert_utils.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -paddle::experimental::DataType TransToPtenDataType( +paddle::experimental::DataType TransToPhiDataType( const paddle::framework::proto::VarType::Type& dtype) { // Set the order of case branches according to the frequency with // the data type is used diff --git a/paddle/fluid/framework/convert_utils.h b/paddle/fluid/framework/convert_utils.h index c94b5b2311c52..da2af86c77c47 100644 --- a/paddle/fluid/framework/convert_utils.h +++ b/paddle/fluid/framework/convert_utils.h @@ -32,7 +32,7 @@ namespace framework { using DataType = paddle::experimental::DataType; using DataLayout = paddle::experimental::DataLayout; -DataType TransToPtenDataType( +DataType TransToPhiDataType( const paddle::framework::proto::VarType::Type& dtype); paddle::framework::proto::VarType::Type TransToProtoVarType( diff --git a/paddle/fluid/framework/convert_utils_test.cc b/paddle/fluid/framework/convert_utils_test.cc index 51b431f4b4a8a..140806dfd7c5e 100644 --- a/paddle/fluid/framework/convert_utils_test.cc +++ b/paddle/fluid/framework/convert_utils_test.cc @@ -43,35 +43,35 @@ TEST(ConvertUtils, DataType) { CHECK(paddle::framework::TransToProtoVarType(paddle::DataType::FLOAT16) == paddle::framework::proto::VarType::FP16); // proto -> enum - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP64) == paddle::DataType::FLOAT64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP32) == paddle::DataType::FLOAT32); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT64) == paddle::DataType::INT64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT32) == paddle::DataType::INT32); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT8) == paddle::DataType::INT8); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::UINT8) == paddle::DataType::UINT8); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::INT16) == paddle::DataType::INT16); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::BOOL) == paddle::DataType::BOOL); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::COMPLEX64) == paddle::DataType::COMPLEX64); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::COMPLEX128) == paddle::DataType::COMPLEX128); - CHECK(paddle::framework::TransToPtenDataType( + CHECK(paddle::framework::TransToPhiDataType( paddle::framework::proto::VarType::FP16) == paddle::DataType::FLOAT16); } diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 597265bb2473f..b9e3bee25f6b5 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_meta_info_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" @@ -779,13 +779,13 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, for (size_t i = 0; i < ctx->InputSize(in_name); ++i) { auto dtype = ctx->GetInputDataType(in_name, i); vec_custom_dtype.emplace_back( - paddle::framework::TransToPtenDataType(dtype)); + paddle::framework::TransToPhiDataType(dtype)); } vec_input_dtypes.emplace_back(vec_custom_dtype); } else { auto dtype = ctx->GetInputDataType(in_name); input_dtypes.emplace_back( - paddle::framework::TransToPtenDataType(dtype)); + paddle::framework::TransToPhiDataType(dtype)); } } diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index cf9e3de6c1a58..4757eb60f4361 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -23,7 +23,7 @@ limitations under the License. */ #include "paddle/fluid/platform/init.h" #include "paddle/phi/kernels/funcs/math_function.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 7152004b63de6..15cf30c1cf352 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -28,7 +28,7 @@ TEST(DataType, float16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::TransToPtenDataType(dtype)); + tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); // test fp16 tensor EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()), @@ -51,7 +51,7 @@ TEST(DataType, bfloat16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::TransToPtenDataType(dtype)); + tensor.mutable_data(cpu, f::TransToPhiDataType(dtype)); // test bf16 tensor EXPECT_EQ(f::TransToProtoVarType(tensor.dtype()), diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 47ab1e0fc030a..06019372a7323 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -34,7 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/heter_wrapper.cc b/paddle/fluid/framework/fleet/heter_wrapper.cc index 8c3c1e015262b..84dcdad78298a 100644 --- a/paddle/fluid/framework/fleet/heter_wrapper.cc +++ b/paddle/fluid/framework/fleet/heter_wrapper.cc @@ -161,7 +161,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->set_lod(lod); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(ToVarType(req_var.data_type()))); + place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), @@ -202,7 +202,7 @@ void HeterWrapper::DeSerializeToTensor(Scope* scope, tensor->set_lod(lod); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(ToVarType(req_var.data_type()))); + place, framework::TransToPhiDataType(ToVarType(req_var.data_type()))); #ifdef PADDLE_WITH_XPU memory::Copy(place, tensor_data, platform::CPUPlace(), req_var.data().data(), diff --git a/paddle/fluid/framework/heter_section_worker.cc b/paddle/fluid/framework/heter_section_worker.cc index 8aafd3459ed1a..b6759bb2e6fe6 100644 --- a/paddle/fluid/framework/heter_section_worker.cc +++ b/paddle/fluid/framework/heter_section_worker.cc @@ -38,7 +38,7 @@ void SetMicroId(paddle::framework::Scope* scope, std::vector dims{1}; tensor->Resize(phi::make_ddim(dims)); void* tensor_data = tensor->mutable_data( - place, framework::TransToPtenDataType(framework::proto::VarType::FP32)); + place, framework::TransToPhiDataType(framework::proto::VarType::FP32)); if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA std::vector temp; diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index 0900ed2ff2f5d..e14b91d935d05 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar_array.h" @@ -144,7 +144,7 @@ class CompatMetaTensor : public phi::MetaTensor { } } else { auto* var = BOOST_GET_CONST(VarDesc*, var_); - return paddle::framework::TransToPtenDataType(var->GetDataType()); + return paddle::framework::TransToPhiDataType(var->GetDataType()); } } @@ -341,10 +341,10 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } if (infershape_inputs.size() != 1) { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarArrayFromVarList(vars))); + std::move(experimental::MakePhiScalarArrayFromVarList(vars))); } else { infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarArrayFromVar(*vars[0]))); + std::move(experimental::MakePhiScalarArrayFromVar(*vars[0]))); } } else { // If is not in runtime, we will set default value(-1) for ScalarArray @@ -419,7 +419,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); infer_meta_context.EmplaceBackAttr( - std::move(experimental::MakePtenScalarFromVar(*var))); + std::move(experimental::MakePhiScalarFromVar(*var))); } else { phi::Scalar tensor_scalar(-1); tensor_scalar.SetFromTensor(true); @@ -481,7 +481,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, BOOST_GET_CONST(std::vector, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = paddle::framework::TransToPtenDataType( + auto data_type = paddle::framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); infer_meta_context.EmplaceBackAttr(data_type); diff --git a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc index ec5d48b3093f7..26ee02ff1812d 100644 --- a/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/ir/fuse_optimizer_ops_pass/fuse_optimizer_op_pass.cc @@ -276,13 +276,13 @@ bool FuseOptimizerOpPass::OpWithKernelSupportCPUAndGPU( bool support_gpu = false; auto &kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = - kernel_factory.SelectKernelMap(phi::TransToPtenKernelName(op_type)); + kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); bool has_op_kernel = kernel_key_map.size() > 0 ? 
true : false; for (auto &kernel : kernel_key_map) { - if (platform::is_gpu_place(phi::TransToPtenPlace(kernel.first.backend()))) { + if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { support_gpu = true; } else if (platform::is_cpu_place( - phi::TransToPtenPlace(kernel.first.backend()))) { + phi::TransToPhiPlace(kernel.first.backend()))) { support_cpu = true; } } diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc index dafcc9c4e16a3..e9850483ebe91 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc @@ -96,7 +96,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void MainTest(bool convWithExistingBias) { diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 3a78c229bd8fa..889417b78c864 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -126,7 +126,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void PreparePass(std::unique_ptr* graph, const ProgramDesc& prog, diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index e00bb84e35c09..0506bfaf447ac 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -526,7 +526,7 @@ void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, auto x = scope->Var(var_name); auto tensor = x->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(proto::VarType::FP32), 1); + framework::TransToPhiDataType(proto::VarType::FP32), 1); } void PrepareGraph(std::unique_ptr* graph, const ProgramDesc& prog) { diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index a9e0b9c98b46f..56f9e6842373b 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -447,7 +447,7 @@ void MergeLoDTensor(LoDTensor *target, target->set_layout(new_layout); target->set_lod(new_lod); target->mutable_data(dst_place, - paddle::framework::TransToPtenDataType(new_type)); + paddle::framework::TransToPhiDataType(new_type)); int begin = 0; for (auto *src : lod_tensors) { diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 766a3b9e495d5..9b597a9efde8a 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -416,18 +416,18 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { if (op_with_kernel == nullptr) { instr_node.OpBase()->Run(*local_scope, place_); } else { - // fit for pten - if (instr_node.PtenKernel() && 
instr_node.PtenKernel()->IsValid()) { - VLOG(4) << "Run pten kernel: " << op->Type(); + // fit for phi + if (instr_node.PhiKernel() && instr_node.PhiKernel()->IsValid()) { + VLOG(4) << "Run phi kernel: " << op->Type(); VLOG(4) << instr_node.InnerRuntimeContext().get() << " " << &instr_node.DeviceContext(); phi::KernelContext pt_kernel_context; - op_with_kernel->BuildPtenKernelContext( + op_with_kernel->BuildPhiKernelContext( *instr_node.InnerRuntimeContext().get(), const_cast(&instr_node.DeviceContext()), &pt_kernel_context); - (*instr_node.PtenKernel())(&pt_kernel_context); + (*instr_node.PhiKernel())(&pt_kernel_context); } else { instr_node.KernelFunc()(*instr_node.InnerExecutionContext().get()); diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 0767dde4392b8..d595af58257d4 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -407,14 +407,14 @@ void build_op_func_list(const platform::Place& place, auto exec_ctx = ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context); - auto run_pten_kernel = false; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel( + auto run_phi_kernel = false; + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel( op_with_kernel->Type())) { - auto pt_kernel_key = op_with_kernel->ChoosePtenKernel(exec_ctx); - auto pt_kernel_name = op_with_kernel->PtenKernelSignature()->name; + auto pt_kernel_key = op_with_kernel->ChoosePhiKernel(exec_ctx); + auto pt_kernel_name = op_with_kernel->PhiKernelSignature()->name; - if (op_with_kernel->PtenKernel()->IsValid()) { - run_pten_kernel = true; + if (op_with_kernel->PhiKernel()->IsValid()) { + run_phi_kernel = true; } else { auto kernels_iter = all_op_kernels.find(op_with_kernel->Type()); if (kernels_iter == all_op_kernels.end() || @@ -422,26 +422,26 @@ void build_op_func_list(const platform::Place& place, kernels_iter->second.end()) { auto pt_cpu_kernel_key = FallBackToCpu( expected_kernel_key, pt_kernel_key, *op_with_kernel); - op_with_kernel->ResetPtenKernel( + op_with_kernel->ResetPhiKernel( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_cpu_kernel_key))); - if (op_with_kernel->PtenKernel()->IsValid()) { + if (op_with_kernel->PhiKernel()->IsValid()) { VLOG(6) << "Static mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key - << " | kernel: " << *(op_with_kernel->PtenKernel()); - run_pten_kernel = true; + << " | kernel: " << *(op_with_kernel->PhiKernel()); + run_phi_kernel = true; } } } } VLOG(3) << op_with_kernel->Type() << " : expected_kernel_key : " << expected_kernel_key; - if (run_pten_kernel) { + if (run_phi_kernel) { phi::KernelContext pt_kernel_context; - op_with_kernel->BuildPtenKernelContext(runtime_context, dev_ctx, - &pt_kernel_context); - op_func_node.pt_kernel_ = op_with_kernel->PtenKernel(); + op_with_kernel->BuildPhiKernelContext(runtime_context, dev_ctx, + &pt_kernel_context); + op_func_node.pt_kernel_ = op_with_kernel->PhiKernel(); (*op_func_node.pt_kernel_)(&pt_kernel_context); } else { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 1fbe4500ac6df..35bac43931703 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -688,9 +688,7 @@ OpKernelComputeFunc 
Instruction::KernelFunc() const { return op_func_node_.kernel_func_; } -phi::Kernel* Instruction::PtenKernel() const { - return op_func_node_.pt_kernel_; -} +phi::Kernel* Instruction::PhiKernel() const { return op_func_node_.pt_kernel_; } OpFuncType Instruction::KernelType() const { return op_func_node_.type_; } diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 93b9aee4f32cb..dc34bd2c69411 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -300,7 +300,7 @@ struct OpFuncNode { OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned - // fit for pten kernel + // fit for phi kernel phi::Kernel* pt_kernel_{nullptr}; // not owned OpFuncType type_; @@ -321,7 +321,7 @@ class Instruction { OpKernelComputeFunc KernelFunc() const; - phi::Kernel* PtenKernel() const; + phi::Kernel* PhiKernel() const; OpFuncType KernelType() const; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b7332896818c9..d33791f70c4d2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/op_call_stack.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/shape_inference.h" #include "paddle/fluid/framework/transfer_scope_cache.h" #include "paddle/fluid/framework/unused_var_check.h" @@ -616,9 +616,9 @@ bool OpSupportGPU(const std::string& op_type) { // check in new Function kernel first auto& kernel_factory = phi::KernelFactory::Instance(); auto kernel_key_map = - kernel_factory.SelectKernelMap(phi::TransToPtenKernelName(op_type)); + kernel_factory.SelectKernelMap(phi::TransToPhiKernelName(op_type)); for (auto& kernel : kernel_key_map) { - if (platform::is_gpu_place(phi::TransToPtenPlace(kernel.first.backend()))) { + if (platform::is_gpu_place(phi::TransToPhiPlace(kernel.first.backend()))) { return true; } } @@ -1186,10 +1186,10 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // phase phi::KernelKey pt_kernel_key; std::string pt_kernel_name; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(type_)) { + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(type_)) { if (pt_kernel_signature_ == nullptr || pt_kernel_ == nullptr) { pt_kernel_signature_.reset( - new KernelSignature(std::move(GetExpectedPtenKernelArgs(exe_ctx)))); + new KernelSignature(std::move(GetExpectedPhiKernelArgs(exe_ctx)))); VLOG(6) << *pt_kernel_signature_.get(); kernel_type_.reset( @@ -1197,17 +1197,17 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(kernel_type_->place_); pt_kernel_name = pt_kernel_signature_->name; - pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get()); + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset( new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_kernel_key))); if (pt_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePtenKernel - kernel name: " + VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << *pt_kernel_; } else { - VLOG(6) << "Static mode ChoosePtenKernel - kernel `" << 
pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } } @@ -1222,7 +1222,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, && !is_xpu_unsupport #endif ) { - run_pten_kernel_ = true; + run_phi_kernel_ = true; } else { auto& all_op_kernels = AllOpKernels(); auto kernels_iter = all_op_kernels.find(type_); @@ -1244,12 +1244,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, VLOG(6) << "Static mode PrepareImpl - kernel name: " << pt_kernel_name << " | kernel key: " << pt_cpu_kernel_key << " | kernel: " << *pt_kernel_; - run_pten_kernel_ = true; + run_phi_kernel_ = true; } } } } - if (!run_pten_kernel_) { + if (!run_phi_kernel_) { if (kernel_type_.get() == nullptr || kernel_func_.get() == nullptr) { ChooseKernel(exe_ctx); dev_ctx = pool.Get(kernel_type_->place_); @@ -1290,13 +1290,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, platform::RecordEvent record_event("compute", platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - if (run_pten_kernel_) { + if (run_phi_kernel_) { phi::KernelContext pt_kernel_context; // Do data transform before building KernelContext // TODO(zhiqiu): support TransferInplaceVarsBack - PreparePtenData(exec_scope, *pt_kernel_, *pt_kernel_signature_, - runtime_ctx); - BuildPtenKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); + PreparePhiData(exec_scope, *pt_kernel_, *pt_kernel_signature_, + runtime_ctx); + BuildPhiKernelContext(*runtime_ctx, dev_ctx, &pt_kernel_context); (*pt_kernel_)(&pt_kernel_context); } else { (*kernel_func_)( @@ -1388,26 +1388,26 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType( return expected_kernel_key; } -phi::KernelKey OperatorWithKernel::ChoosePtenKernel( +phi::KernelKey OperatorWithKernel::ChoosePhiKernel( const ExecutionContext& ctx) const { pt_kernel_signature_.reset( - new KernelSignature(std::move(GetExpectedPtenKernelArgs(ctx)))); + new KernelSignature(std::move(GetExpectedPhiKernelArgs(ctx)))); VLOG(6) << *pt_kernel_signature_.get(); kernel_type_.reset( new OpKernelType(std::move(InnerGetExpectedKernelType(ctx)))); auto pt_kernel_name = pt_kernel_signature_->name; - auto pt_kernel_key = TransOpKernelTypeToPtenKernelKey(*kernel_type_.get()); + auto pt_kernel_key = TransOpKernelTypeToPhiKernelKey(*kernel_type_.get()); pt_kernel_.reset(new phi::Kernel(phi::KernelFactory::Instance().SelectKernel( pt_kernel_name, pt_kernel_key))); if (pt_kernel_->IsValid()) { - VLOG(6) << "Static mode ChoosePtenKernel - kernel name: " << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel name: " << pt_kernel_name << " | kernel key: " << pt_kernel_key << " | kernel: " << *pt_kernel_; } else { - VLOG(6) << "Static mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Static mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } return pt_kernel_key; @@ -1918,7 +1918,7 @@ OpKernelType OperatorWithKernel::GetKernelTypeForVar( tensor.layout()); } -KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( +KernelSignature OperatorWithKernel::GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const { InitDefaultKernelSignatureMap(); ExecutionArgumentMappingContext arg_mapping_ctx(ctx); @@ -1926,7 +1926,7 @@ KernelSignature OperatorWithKernel::GetExpectedPtenKernelArgs( arg_mapping_ctx); } -Scope* OperatorWithKernel::PreparePtenData( +Scope* OperatorWithKernel::PreparePhiData( const Scope& scope, const phi::Kernel& pt_kernel, const KernelSignature& pt_kernel_signature, RuntimeContext* 
ctx) const { auto& input_names = std::get<0>(pt_kernel_signature.args); @@ -1981,12 +1981,12 @@ Scope* OperatorWithKernel::PreparePtenData( if (in_def.backend == phi::Backend::ALL_BACKEND) { continue; } - auto expected_place = phi::TransToPtenPlace(in_def.backend); + auto expected_place = phi::TransToPhiPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; } - VLOG(3) << "PTen Transform Variable " << input_names[i] << " from " + VLOG(3) << "phi Transform Variable " << input_names[i] << " from " << tensor_in->place() << " to " << expected_place; if (!new_scope) { @@ -2007,7 +2007,7 @@ Scope* OperatorWithKernel::PreparePtenData( return new_scope; } -void OperatorWithKernel::BuildPtenKernelContext( +void OperatorWithKernel::BuildPhiKernelContext( const RuntimeContext& ctx, platform::DeviceContext* dev_ctx, phi::KernelContext* pt_kernel_context) const { pt_kernel_context->SetDeviceContext(dev_ctx); @@ -2111,7 +2111,7 @@ void OperatorWithKernel::BuildPtenKernelContext( experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, output_defs.at(i)); SetAllocationForOutputTenosr( - tensor_out, phi::TransToPtenPlace(output_defs.at(i).backend)); + tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -2145,10 +2145,10 @@ void OperatorWithKernel::BuildPtenKernelContext( auto& ins_vector = ctx.inputs.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVar(*ins_vector.front()))); + experimental::MakePhiScalarArrayFromVar(*ins_vector.front()))); } else { // ShapeTensorList pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVarList(ins_vector))); + experimental::MakePhiScalarArrayFromVarList(ins_vector))); } } } else if (attr_defs[i].type_index == @@ -2178,8 +2178,8 @@ void OperatorWithKernel::BuildPtenKernelContext( } } else { auto& ins_vector = ctx.inputs.at(attr_names[i]); - pt_kernel_context->EmplaceBackAttr(std::move( - experimental::MakePtenScalarFromVar(*ins_vector.front()))); + pt_kernel_context->EmplaceBackAttr( + std::move(experimental::MakePhiScalarFromVar(*ins_vector.front()))); } } else { @@ -2198,7 +2198,7 @@ void OperatorWithKernel::BuildPtenKernelContext( pt_kernel_context->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = paddle::framework::TransToPtenDataType( + auto data_type = paddle::framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); pt_kernel_context->EmplaceBackAttr(data_type); @@ -2206,7 +2206,7 @@ void OperatorWithKernel::BuildPtenKernelContext( std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Pten_Kernel args. + // Emplace Back Attr according to the type of Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index ff9cb8a287a26..16718a316513e 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_kernel_type.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" @@ -423,7 +423,7 @@ class ExecutionContext { "size(%d).", allocation_ptr->size(), phi::product(dim) * sizeof(T))); - paddle::framework::Tensor temp_tensor(framework::TransToPtenDataType( + paddle::framework::Tensor temp_tensor(framework::TransToPhiDataType( framework::ToDataType(std::type_index(typeid(T))))); temp_tensor.Resize(dim); temp_tensor.ResetHolder(std::move(shared_allocation)); @@ -538,14 +538,14 @@ class OperatorWithKernel : public OperatorBase { } bool SupportGPU() const override { - auto pten_kernels = phi::KernelFactory::Instance().SelectKernelMap( - phi::TransToPtenKernelName(type_)); - auto has_pten_kernel = - std::any_of(pten_kernels.begin(), pten_kernels.end(), + auto phi_kernels = phi::KernelFactory::Instance().SelectKernelMap( + phi::TransToPhiKernelName(type_)); + auto has_phi_kernel = + std::any_of(phi_kernels.begin(), phi_kernels.end(), [](phi::KernelKeyMap::const_reference kern_pair) { return kern_pair.first.backend() == phi::Backend::GPU; }); - if (has_pten_kernel) { + if (has_phi_kernel) { return true; } else { auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); @@ -558,7 +558,7 @@ class OperatorWithKernel : public OperatorBase { } bool SupportNPU() const override { - // TODO(zhiqiu): support pten if needed? + // TODO(zhiqiu): support phi if needed? auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { @@ -566,7 +566,7 @@ class OperatorWithKernel : public OperatorBase { }); } bool SupportMLU() const override { - // TODO(zhiqiu): support pten if needed? + // TODO(zhiqiu): support phi if needed? auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_); return std::any_of(op_kernels.begin(), op_kernels.end(), [](OpKernelMap::const_reference kern_pair) { @@ -603,39 +603,39 @@ class OperatorWithKernel : public OperatorBase { return kernel_type_->place_; } - /* member functions for adapting to pten lib */ + /* member functions for adapting to phi lib */ /** In the Tensor calculation library, the new Kernel adopts a clearer and * more streamlined design. The arguments of the Kernel and the input and * output arguments registered in the original OpMaker do not match in some * cases, so we use map to record the arguments required by the kernel. * When selecting Kernel during Op execution, select the arguments of the - * original Op according to the GetExpectedPtenKernelArgs returned arguments. + * original Op according to the GetExpectedPhiKernelArgs returned arguments. */ - phi::KernelSignature GetExpectedPtenKernelArgs( + phi::KernelSignature GetExpectedPhiKernelArgs( const ExecutionContext& ctx) const; - /* member functions for adapting to pten lib */ - phi::KernelKey ChoosePtenKernel(const ExecutionContext& ctx) const; + /* member functions for adapting to phi lib */ + phi::KernelKey ChoosePhiKernel(const ExecutionContext& ctx) const; /** - * Transfer data place for pten kernel + * Transfer data place for phi kernel * Is this really needed? 
*/ - Scope* PreparePtenData(const Scope& scope, const phi::Kernel& pt_kernel, - const phi::KernelSignature& pt_kernel_signature, - RuntimeContext* ctx) const; + Scope* PreparePhiData(const Scope& scope, const phi::Kernel& pt_kernel, + const phi::KernelSignature& pt_kernel_signature, + RuntimeContext* ctx) const; - void BuildPtenKernelContext(const RuntimeContext& ctx, - platform::DeviceContext* dev_ctx, - phi::KernelContext* pt_kernel_context) const; + void BuildPhiKernelContext(const RuntimeContext& ctx, + platform::DeviceContext* dev_ctx, + phi::KernelContext* pt_kernel_context) const; - phi::KernelSignature* PtenKernelSignature() const { + phi::KernelSignature* PhiKernelSignature() const { return pt_kernel_signature_.get(); } - phi::Kernel* PtenKernel() const { return pt_kernel_.get(); } + phi::Kernel* PhiKernel() const { return pt_kernel_.get(); } - void ResetPtenKernel(phi::Kernel* kernel) const { + void ResetPhiKernel(phi::Kernel* kernel) const { return pt_kernel_.reset(kernel); } @@ -692,9 +692,9 @@ class OperatorWithKernel : public OperatorBase { mutable std::mutex cache_update_mutex_; mutable bool enable_cache_transfer_scope_ = false; // NOTE(chenweihang): Similar op members are used to adapt to - // new pten kernel, if there is a better design in the future, + // new phi kernel, if there is a better design in the future, // we may polish the implementation here - mutable bool run_pten_kernel_ = false; + mutable bool run_phi_kernel_ = false; mutable bool run_kp_kernel = false; mutable std::unique_ptr pt_kernel_signature_; mutable std::unique_ptr pt_kernel_; diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc index 09bca4a735461..c0e1ca8f0d123 100644 --- a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -209,7 +209,7 @@ class CinnGraphSymbolizationTest : public ::testing::Test { tensor.Resize(dims); tensor.mutable_data( platform::CPUPlace(), - framework::TransToPtenDataType(framework::proto::VarType::FP32)); + framework::TransToPhiDataType(framework::proto::VarType::FP32)); return tensor; }; #define FillFeedList(Name) feed_targets[#Name] = create_tensor(); diff --git a/paddle/fluid/framework/pten_utils.cc b/paddle/fluid/framework/phi_utils.cc similarity index 85% rename from paddle/fluid/framework/pten_utils.cc rename to paddle/fluid/framework/phi_utils.cc index af9d62ff7a845..355291beb60f9 100644 --- a/paddle/fluid/framework/pten_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_info.h" @@ -57,12 +57,11 @@ class KernelArgsNameMakerByOpProto : public KernelArgsNameMaker { paddle::SmallVector attr_names_; }; -OpKernelType TransPtenKernelKeyToOpKernelType( - const phi::KernelKey& kernel_key) { +OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { proto::VarType::Type data_type = paddle::framework::TransToProtoVarType(kernel_key.dtype()); // no need to set current device id here - platform::Place place = phi::TransToPtenPlace(kernel_key.backend(), false); + platform::Place place = phi::TransToPhiPlace(kernel_key.backend(), false); DataLayout data_layout = kernel_key.layout(); LibraryType library_type = LibraryType::kPlain; if (kernel_key.backend() == phi::Backend::MKLDNN) { @@ -76,9 +75,9 @@ OpKernelType TransPtenKernelKeyToOpKernelType( return OpKernelType(data_type, place, data_layout, library_type); } -phi::KernelKey TransOpKernelTypeToPtenKernelKey( +phi::KernelKey TransOpKernelTypeToPhiKernelKey( const OpKernelType& kernel_type) { - phi::Backend backend = phi::TransToPtenBackend(kernel_type.place_); + phi::Backend backend = phi::TransToPhiBackend(kernel_type.place_); if (kernel_type.library_type_ == LibraryType::kMKLDNN) { backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { @@ -88,7 +87,7 @@ phi::KernelKey TransOpKernelTypeToPtenKernelKey( } paddle::experimental::DataLayout layout = kernel_type.data_layout_; paddle::experimental::DataType dtype = - paddle::framework::TransToPtenDataType(kernel_type.data_type_); + paddle::framework::TransToPhiDataType(kernel_type.data_type_); return phi::KernelKey(backend, layout, dtype); } @@ -98,8 +97,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #ifdef PADDLE_WITH_XPU if (platform::is_xpu_place(expected_kernel_key.place_) || paddle::platform::is_in_xpu_black_list(op.Type())) { - VLOG(3) << "pten missing XPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing XPU kernel: " << op.Type() + << "phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -107,8 +106,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #endif #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(expected_kernel_key.place_)) { - VLOG(3) << "pten missing NPU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing NPU kernel: " << op.Type() + << "phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -116,8 +115,8 @@ phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, #endif #ifdef PADDLE_WITH_MLU if (platform::is_mlu_place(expected_kernel_key.place_)) { - VLOG(3) << "pten missing MLU kernel: " << op.Type() - << ", expected_kernel_key:" << expected_kernel_key + VLOG(3) << "phi missing MLU kernel: " << op.Type() + << "phipected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; return phi::KernelKey(phi::Backend::CPU, kernel_key.layout(), kernel_key.dtype()); @@ -132,17 +131,17 @@ KernelArgsNameMakerByOpProto::GetInputArgsNames() { auto& in = 
op_proto_->inputs()[i]; auto& in_name = in.name(); if ((in.has_extra() && in.extra()) || (in.has_quant() && in.quant())) { - VLOG(6) << "Parse PtenKernel input: skip extra & quant input - " + VLOG(6) << "Parse PhiKernel input: skip extra & quant input - " << in_name; continue; } // If contains dispensable input, we should override the // OpArgumentMapping method self in phi/ops/compat dir if (in.has_dispensable() && in.dispensable()) { - VLOG(6) << "Parse PtenKernel input: skip dispensable input - " << in_name; + VLOG(6) << "Parse PhiKernel input: skip dispensable input - " << in_name; continue; } - VLOG(6) << "Parse PtenKernel input: " << in_name; + VLOG(6) << "Parse PhiKernel input: " << in_name; input_names_.emplace_back(in_name); } return input_names_; @@ -154,11 +153,11 @@ KernelArgsNameMakerByOpProto::GetOutputArgsNames() { auto& out = op_proto_->outputs()[i]; auto& out_name = out.name(); if ((out.has_extra() && out.extra()) || (out.has_quant() && out.quant())) { - VLOG(6) << "Parse PtenKernel output: skip extra & quant output - " + VLOG(6) << "Parse PhiKernel output: skip extra & quant output - " << out_name; continue; } - VLOG(6) << "Parse PtenKernel output: " << out_name; + VLOG(6) << "Parse PhiKernel output: " << out_name; output_names_.emplace_back(out_name); } return output_names_; @@ -173,17 +172,17 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { attr_name == "op_role" || attr_name == "op_role_var" || attr_name == "op_namescope" || attr_name == "op_callstack" || attr_name == "op_device") { - VLOG(6) << "Parse PtenKernel attribute: skip needless attr - " + VLOG(6) << "Parse PhiKernel attribute: skip needless attr - " << attr_name; continue; } if ((attr.has_extra() && attr.extra()) || (attr.has_quant() && attr.quant())) { - VLOG(6) << "Parse PtenKernel attribute: skip extra & quant attr - " + VLOG(6) << "Parse PhiKernel attribute: skip extra & quant attr - " << attr_name; continue; } - VLOG(6) << "Parse PtenKernel attribute: " << attr_name; + VLOG(6) << "Parse PhiKernel attribute: " << attr_name; attr_names_.emplace_back(attr_name); } @@ -191,7 +190,7 @@ KernelArgsNameMakerByOpProto::GetAttrsArgsNames() { } KernelSignature KernelArgsNameMakerByOpProto::GetKernelSignature() { - return KernelSignature(phi::TransToPtenKernelName(op_proto_->type()), + return KernelSignature(phi::TransToPhiKernelName(op_proto_->type()), GetInputArgsNames(), GetAttrsArgsNames(), GetOutputArgsNames()); } @@ -203,7 +202,7 @@ void InitDefaultKernelSignatureMap() { for (const auto& pair : paddle::framework::OpInfoMap::Instance().map()) { const auto& op_type = pair.first; const auto* op_proto = pair.second.proto_; - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type) && + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type) && op_proto) { paddle::framework::KernelArgsNameMakerByOpProto maker(op_proto); VLOG(10) << "Register kernel signature for " << op_type; diff --git a/paddle/fluid/framework/pten_utils.h b/paddle/fluid/framework/phi_utils.h similarity index 87% rename from paddle/fluid/framework/pten_utils.h rename to paddle/fluid/framework/phi_utils.h index 1bcffbcc31435..1a1f79d827700 100644 --- a/paddle/fluid/framework/pten_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -44,9 +44,8 @@ using KernelSignature = phi::KernelSignature; /* Kernel Key translate */ -OpKernelType TransPtenKernelKeyToOpKernelType(const phi::KernelKey& kernel_key); -phi::KernelKey TransOpKernelTypeToPtenKernelKey( - const OpKernelType& kernel_type); +OpKernelType 
TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key); +phi::KernelKey TransOpKernelTypeToPhiKernelKey(const OpKernelType& kernel_type); phi::KernelKey FallBackToCpu(const OpKernelType& expected_kernel_key, const phi::KernelKey& kernel_key, const framework::OperatorBase& op); @@ -68,25 +67,25 @@ void SetAllocationForOutputTenosr(phi::TensorBase* tensor, // TODO(Wilber): support others device context. template -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = T; }; template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::CPUContext; }; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::GPUContext; }; #endif #ifdef PADDLE_WITH_XPU template <> -struct ConvertToPtenContext { +struct ConvertToPhiContext { using TYPE = phi::XPUContext; }; #endif diff --git a/paddle/fluid/framework/pten_utils_test.cc b/paddle/fluid/framework/phi_utils_test.cc similarity index 84% rename from paddle/fluid/framework/pten_utils_test.cc rename to paddle/fluid/framework/phi_utils_test.cc index da1431c0efafe..cbcdf24c9f32b 100644 --- a/paddle/fluid/framework/pten_utils_test.cc +++ b/paddle/fluid/framework/phi_utils_test.cc @@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/variable.h" -TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { +TEST(PhiUtils, TransPhiKernelKeyToOpKernelType) { phi::KernelKey kernel_key(phi::Backend::CPU, phi::DataLayout::NCHW, phi::DataType::FLOAT32); auto op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_)); @@ -33,7 +33,7 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { phi::KernelKey kernel_key_mkldnn(phi::Backend::MKLDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_mkldnn); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_mkldnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_cpu_place(op_kernel_type.place_)); @@ -45,7 +45,7 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { phi::KernelKey kernel_key_cudnn(phi::Backend::GPUDNN, phi::DataLayout::NCHW, phi::DataType::FLOAT32); op_kernel_type = - paddle::framework::TransPtenKernelKeyToOpKernelType(kernel_key_cudnn); + paddle::framework::TransPhiKernelKeyToOpKernelType(kernel_key_cudnn); ASSERT_EQ(op_kernel_type.data_type_, paddle::framework::proto::VarType::FP32); ASSERT_EQ(op_kernel_type.data_layout_, paddle::framework::DataLayout::kNCHW); ASSERT_TRUE(paddle::platform::is_gpu_place(op_kernel_type.place_)); @@ -54,12 +54,12 @@ TEST(PtenUtils, TransPtenKernelKeyToOpKernelType) { #endif } -TEST(PtenUtils, 
TransOpKernelTypeToPtenKernelKey) { +TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) { paddle::framework::OpKernelType op_kernel_type( paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), paddle::framework::DataLayout::kNCHW); auto kernel_key = - paddle::framework::TransOpKernelTypeToPtenKernelKey(op_kernel_type); + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type); ASSERT_EQ(kernel_key.dtype(), phi::DataType::FLOAT32); ASSERT_EQ(kernel_key.layout(), phi::DataLayout::NCHW); ASSERT_EQ(kernel_key.backend(), phi::Backend::CPU); @@ -69,8 +69,8 @@ TEST(PtenUtils, TransOpKernelTypeToPtenKernelKey) { paddle::framework::proto::VarType::FP32, paddle::platform::CPUPlace(), paddle::framework::DataLayout::kMKLDNN, paddle::framework::LibraryType::kMKLDNN); - auto kernel_key_mkldnn = paddle::framework::TransOpKernelTypeToPtenKernelKey( - op_kernel_type_mkldnn); + auto kernel_key_mkldnn = + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn); ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32); ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN); ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::MKLDNN); @@ -82,7 +82,7 @@ TEST(PtenUtils, TransOpKernelTypeToPtenKernelKey) { paddle::framework::DataLayout::kNCHW, paddle::framework::LibraryType::kCUDNN); auto kernel_key_cudnn = - paddle::framework::TransOpKernelTypeToPtenKernelKey(op_kernel_type_cudnn); + paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_cudnn); ASSERT_EQ(kernel_key_cudnn.dtype(), phi::DataType::FLOAT32); ASSERT_EQ(kernel_key_cudnn.layout(), phi::DataLayout::NCHW); ASSERT_EQ(kernel_key_cudnn.backend(), phi::Backend::GPUDNN); diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 10eefff093b0e..10ceae62dccbb 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1457,7 +1457,7 @@ std::ostream& print_tensor>( std::ostream& operator<<(std::ostream& os, const LoD& lod) { // NOTE(xiongkun): // https://stackoverflow.com/questions/5195512/namespaces-and-operator-resolution - // if we don't redefine, the operator << of pten / framework LoD is not found. + // if we don't redefine, the operator << of phi / framework LoD is not found. 
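For readers following the rename, the two translation helpers exercised by this test are inverses for the common CPU / FP32 / NCHW case; below is a minimal usage sketch, assuming the phi_utils.h header introduced above is on the include path (illustrative only, mirroring the ASSERTs in phi_utils_test.cc).

#include "paddle/fluid/framework/phi_utils.h"

// Round trip between the phi and fluid kernel-key representations.
void KernelKeyRoundTrip() {
  phi::KernelKey phi_key(phi::Backend::CPU, phi::DataLayout::NCHW,
                         phi::DataType::FLOAT32);
  auto op_kernel_type =
      paddle::framework::TransPhiKernelKeyToOpKernelType(phi_key);
  // op_kernel_type.data_type_   == proto::VarType::FP32
  // op_kernel_type.data_layout_ == DataLayout::kNCHW, place_ == CPUPlace

  auto back =
      paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type);
  // back.backend() == phi::Backend::CPU, back.dtype() == DataType::FLOAT32
}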
paddle::string::operator<<(os, lod); return os; } diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 6e8bfbb4a7761..149202468be6c 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -70,12 +70,12 @@ OpSupportedInfos(const std::string& place, } } - auto pten_kernels = phi::KernelFactory::Instance().kernels(); - for (auto& kernel_pair : pten_kernels) { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto& kernel_pair : phi_kernels) { auto op_type = phi::TransToFluidOpName(kernel_pair.first); for (auto& info_pair : kernel_pair.second) { framework::OpKernelType kernel_type = - framework::TransPtenKernelKeyToOpKernelType(info_pair.first); + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); if (is_target_place[query_place](kernel_type.place_) && kernel_type.data_type_ == dtype && all_ops.count(op_type)) { VLOG(4) << op_type << " " << supported_ops.size(); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 97a188e5c9c27..8373c7fe50d02 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -154,7 +154,7 @@ void BasicEngine::CheckBackwardInputs(const OpBase& op) { // Here, we use the type of the corresponding forward datatype. tensor->mutable_data( - op.place(), framework::TransToPtenDataType(var->ForwardDataType())); + op.place(), framework::TransToPhiDataType(var->ForwardDataType())); VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero with dtype " << framework::DataTypeToString(var->ForwardDataType()); diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 3587736a851da..0abc5ad90e269 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -791,13 +791,13 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var, << var->Var().Get().dims(); tensor->Resize(var->Var().Get().dims()); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } else { auto* tensor = dst_var->MutableVar()->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } } @@ -925,13 +925,13 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, << var->Var().Get().dims(); tensor->Resize(var->Var().Get().dims()); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } else { auto* tensor = dst_var->MutableVar()->GetMutable(); tensor->mutable_data(place, - framework::TransToPtenDataType(var->DataType())); + framework::TransToPhiDataType(var->DataType())); phi::funcs::set_constant(*dev_ctx, tensor, 0.0); } } diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index f1d0c8afdd50e..56ddbf3386198 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -314,10 +314,10 @@ static void FillConstantLike(const VariableWrapper &ref_var, // default data_type for now. 
if (ref_var.ForwardDataType() != -1) { dst_tensor->mutable_data( - place, framework::TransToPtenDataType(ref_var.ForwardDataType())); + place, framework::TransToPhiDataType(ref_var.ForwardDataType())); } else { - dst_tensor->mutable_data( - place, framework::TransToPtenDataType(ref_var.DataType())); + dst_tensor->mutable_data(place, + framework::TransToPhiDataType(ref_var.DataType())); } phi::funcs::set_constant(*dev_ctx, dst_tensor, value); } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 6d18b0a86f091..9dd1dacc02c25 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -121,7 +121,7 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, kernel_type_(kernel_type), func_(nullptr), dev_ctx_(dev_ctx), - run_pten_kernel_(true), + run_phi_kernel_(true), pt_kernel_signature_(kernel_signature), pt_kernel_(pt_kernel) {} @@ -151,7 +151,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, #endif // NOTE(zhiqiu): for kernels on given device, for example NPU, the order to // choose is: - // pten npu kernel > fluid npu kernel > pten cpu kernel > fluid cpu kernel + // phi npu kernel > fluid npu kernel > phi cpu kernel > fluid cpu kernel // 1. get expected kernel key auto dygraph_exe_ctx = DygraphExecutionContext( @@ -168,12 +168,12 @@ PreparedOp PrepareImpl(const NameVarMap& ins, expected_kernel_key) || paddle::platform::is_in_xpu_black_list(op.Type()); #endif - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { - pt_kernel_signature = op.GetExpectedPtenKernelArgs(dygraph_exe_ctx); + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { + pt_kernel_signature = op.GetExpectedPhiKernelArgs(dygraph_exe_ctx); VLOG(6) << pt_kernel_signature; pt_kernel_name = pt_kernel_signature.name; - pt_kernel_key = TransOpKernelTypeToPtenKernelKey(expected_kernel_key); + pt_kernel_key = TransOpKernelTypeToPhiKernelKey(expected_kernel_key); auto pt_kernel = phi::KernelFactory::Instance().SelectKernel(pt_kernel_name, pt_kernel_key); @@ -195,7 +195,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, return PreparedOp(op, ctx, expected_kernel_key, pt_kernel_signature, pt_kernel, dev_ctx); } else { - VLOG(6) << "Dynamic mode ChoosePtenKernel - kernel `" << pt_kernel_name + VLOG(6) << "Dynamic mode ChoosePhiKernel - kernel `" << pt_kernel_name << "` not found."; } } @@ -211,7 +211,7 @@ PreparedOp PrepareImpl(const NameVarMap& ins, || is_xpu_unsupport #endif ) { - if (phi::KernelFactory::Instance().HasCompatiblePtenKernel(op.Type())) { + if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(op.Type())) { auto pt_cpu_kernel_key = FallBackToCpu(expected_kernel_key, pt_kernel_key, op); auto pt_cpu_kernel = phi::KernelFactory::Instance().SelectKernel( @@ -423,12 +423,12 @@ static void PreparedOpRunPtImpl( platform::TracerEventType::OperatorInner, 1, platform::EventRole::kInnerOp); - PreparePtenData(pt_kernel, pt_kernel_signature, ins); + PreparePhiData(pt_kernel, pt_kernel_signature, ins); phi::KernelContext pt_kernel_context; - BuildDygraphPtenKernelContext(pt_kernel_signature, pt_kernel, ins, - outs, attrs, default_attrs, dev_ctx, - &pt_kernel_context); + BuildDygraphPhiKernelContext(pt_kernel_signature, pt_kernel, ins, + outs, attrs, default_attrs, dev_ctx, + &pt_kernel_context); pt_kernel(&pt_kernel_context); } @@ -451,7 +451,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const 
framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl(op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -465,7 +465,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); @@ -479,7 +479,7 @@ void PreparedOp::Run(const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, const framework::AttributeMap& default_attrs) { - if (run_pten_kernel_) { + if (run_phi_kernel_) { PreparedOpRunPtImpl( op_, kernel_type_, pt_kernel_signature_, pt_kernel_, dev_ctx_, ins, outs, attrs, default_attrs); diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 879b3ec3e68a2..8e1e2fbe9a12d 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -22,7 +22,7 @@ #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/op_kernel_type.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/type_defs.h" #include "paddle/fluid/imperative/execution_context.h" #include "paddle/fluid/imperative/layer.h" @@ -201,9 +201,9 @@ class PreparedOp { framework::OperatorWithKernel::OpKernelFunc func_; platform::DeviceContext* dev_ctx_; // NOTE(chenweihang): Similar op members are used to adapt to - // new pten kernel, if there is a better design in the future, + // new phi kernel, if there is a better design in the future, // we may polish the implementation here - bool run_pten_kernel_{false}; + bool run_phi_kernel_{false}; bool run_kp_kernel_{false}; framework::KernelSignature pt_kernel_signature_; phi::Kernel pt_kernel_; @@ -225,7 +225,7 @@ const inline framework::Attribute& GetAttr( } template -void BuildDygraphPtenKernelContext( +void BuildDygraphPhiKernelContext( const framework::KernelSignature& pt_kernel_signature, const phi::Kernel& pt_kernel, const NameVarMap& ins, const NameVarMap& outs, const framework::AttributeMap& attrs, @@ -327,7 +327,7 @@ void BuildDygraphPtenKernelContext( experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, output_defs.at(i)); framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPtenPlace(output_defs.at(i).backend)); + tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } @@ -369,7 +369,7 @@ void BuildDygraphPtenKernelContext( auto& ins_vector = ins.at(attr_names[i]); if (ins_vector.size() == 1) { // ShapeTensor kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVar(ins_vector[0]->Var()))); + experimental::MakePhiScalarArrayFromVar(ins_vector[0]->Var()))); } else { // ShapeTensorList std::vector variables; variables.reserve(ins_vector.size()); @@ -377,7 +377,7 @@ void BuildDygraphPtenKernelContext( variables.push_back(var_base->MutableVar()); } kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarArrayFromVarList(variables))); + experimental::MakePhiScalarArrayFromVarList(variables))); } } } else if (attr_defs[i].type_index == @@ -409,7 +409,7 @@ void BuildDygraphPtenKernelContext( } else { // scalar is in the input 
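The dynamic-graph path above now prefers a phi kernel whenever one is registered. Condensed to the names that appear in this patch, the lookup done in PrepareImpl is roughly the sketch below; the real code also resolves the kernel signature, honours the XPU black list, and falls back to a CPU phi kernel or the fluid kernel when the lookup fails (phi::Kernel::IsValid() is assumed from the surrounding call sites).

bool HasUsablePhiKernel(
    const paddle::framework::OperatorBase& op,
    const paddle::framework::OpKernelType& expected_kernel_key) {
  auto& factory = phi::KernelFactory::Instance();
  if (!factory.HasCompatiblePhiKernel(op.Type())) {
    return false;  // no phi kernel registered for this op at all
  }
  auto pt_kernel_key =
      paddle::framework::TransOpKernelTypeToPhiKernelKey(expected_kernel_key);
  auto pt_kernel = factory.SelectKernel(phi::TransToPhiKernelName(op.Type()),
                                        pt_kernel_key);
  return pt_kernel.IsValid();  // otherwise the fluid kernel is used
}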
auto& ins_vector = ins.at(attr_names[i]); kernel_ctx->EmplaceBackAttr(std::move( - experimental::MakePtenScalarFromVar(ins_vector[0]->Var()))); + experimental::MakePhiScalarFromVar(ins_vector[0]->Var()))); } } else { @@ -428,7 +428,7 @@ void BuildDygraphPtenKernelContext( kernel_ctx->EmplaceBackAttr(BOOST_GET_CONST(std::string, attr)); } else if (attr_defs[i].type_index == std::type_index(typeid(phi::DataType))) { - auto data_type = framework::TransToPtenDataType( + auto data_type = framework::TransToPhiDataType( static_cast( BOOST_GET_CONST(int, attr))); kernel_ctx->EmplaceBackAttr(data_type); @@ -436,7 +436,7 @@ void BuildDygraphPtenKernelContext( std::type_index(typeid(std::vector))) { if (std::type_index(attr.type()) == std::type_index(typeid(std::vector))) { - // Emplace Back Attr according to the type of Pten_Kernel args. + // Emplace Back Attr according to the type of Phi_Kernel args. const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); const std::vector vector_int64_attr(vector_int_attr.begin(), vector_int_attr.end()); @@ -456,9 +456,9 @@ void BuildDygraphPtenKernelContext( } template -void PreparePtenData(const phi::Kernel& pt_kernel, - const framework::KernelSignature& pt_kernel_signature, - const NameVarMap& ins) { +void PreparePhiData(const phi::Kernel& pt_kernel, + const framework::KernelSignature& pt_kernel_signature, + const NameVarMap& ins) { auto& input_names = std::get<0>(pt_kernel_signature.args); auto& input_defs = pt_kernel.args_def().input_defs(); @@ -482,12 +482,12 @@ void PreparePtenData(const phi::Kernel& pt_kernel, if (in_def.backend == phi::Backend::ALL_BACKEND) { continue; } - auto expected_place = phi::TransToPtenPlace(in_def.backend); + auto expected_place = phi::TransToPhiPlace(in_def.backend); if (platform::is_same_place(tensor_in->place(), expected_place)) { continue; } - VLOG(3) << "Pten Transform Variable " << input_names[i] << " from " + VLOG(3) << "Phi Transform Variable " << input_names[i] << " from " << tensor_in->place() << " to " << expected_place; framework::Tensor tmp_tensor; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 8681382394b9e..3a6365b2af21a 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -446,7 +446,7 @@ void Reducer::InitializeGroups( InitializeDenseGroups(variable_indices_, &group); auto tensor = group.dense_contents_.GetMutable(); tensor->Resize(phi::make_ddim({group.all_length_})) - .mutable_data(place_, framework::TransToPtenDataType(group.dtype_)); + .mutable_data(place_, framework::TransToPhiDataType(group.dtype_)); } // map variables to this group by VariableLocator @@ -738,7 +738,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) { if (!group_tensor.IsInitialized()) { group_tensor.Resize({static_cast(length)}); group_tensor.mutable_data(place_, - framework::TransToPtenDataType(group.dtype_)); + framework::TransToPhiDataType(group.dtype_)); } #ifdef PADDLE_WITH_XPU_BKCL diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index bca7ecc5d17dc..6c304278d21fd 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -96,7 +96,7 @@ void GroupConcatSplit(Place place, size_t size) { { // concat auto* tensor = group.dense_contents_.GetMutable(); tensor->Resize(phi::make_ddim({group.all_length_})) - .mutable_data(place, framework::TransToPtenDataType(group.dtype_)); + .mutable_data(place, 
framework::TransToPhiDataType(group.dtype_)); group.ConcatTensors(*dev_ctx); group.DivNRanks(*dev_ctx, 1); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 8d3e091dbf5ab..e8e9d895b4e8f 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -26,7 +26,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); diff --git a/paddle/fluid/inference/lite/tensor_utils.cc b/paddle/fluid/inference/lite/tensor_utils.cc index 0e4fb3335f3d7..eeaa128290339 100644 --- a/paddle/fluid/inference/lite/tensor_utils.cc +++ b/paddle/fluid/inference/lite/tensor_utils.cc @@ -198,7 +198,7 @@ void InitDstTensor(framework::LoDTensor* dst, const paddle::lite_api::Tensor& src) { dst->mutable_data( inference::lite::utils::GetNativePlace(src.target()), - framework::TransToPtenDataType(GetNativePrecisionType(src.precision()))); + framework::TransToPhiDataType(GetNativePrecisionType(src.precision()))); SetLoD(dst->mutable_lod(), src.lod()); } @@ -269,7 +269,7 @@ void TensorDataShare(framework::LoDTensor* dst, paddle::lite_api::Tensor* src) { SetLoD(dst->mutable_lod(), src->lod()); dst->ResetHolderWithType( holder, - framework::TransToPtenDataType(GetNativePrecisionType(src->precision()))); + framework::TransToPhiDataType(GetNativePrecisionType(src->precision()))); } } // namespace utils diff --git a/paddle/fluid/operators/benchmark/op_tester.cc b/paddle/fluid/operators/benchmark/op_tester.cc index 915ad2f41cde3..4b1593b1f8b40 100644 --- a/paddle/fluid/operators/benchmark/op_tester.cc +++ b/paddle/fluid/operators/benchmark/op_tester.cc @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "paddle/fluid/pybind/pybind.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" namespace paddle { diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index 4ca0dded3e738..bc6cf9d831ff0 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -138,7 +138,7 @@ class CastOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -// cast use pten kernel, so no need to REGISTER_OP_CPU_KERNEL here. +// cast use phi kernel, so no need to REGISTER_OP_CPU_KERNEL here. 
REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, ops::CastOpGradMaker, diff --git a/paddle/fluid/operators/cast_op.h b/paddle/fluid/operators/cast_op.h index 62d747cb9f400..034cb47fab189 100644 --- a/paddle/fluid/operators/cast_op.h +++ b/paddle/fluid/operators/cast_op.h @@ -63,12 +63,12 @@ class CastOpKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace(), static_cast(out_dtype)); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); // call new kernel phi::CastKernel( - static_cast::TYPE&>(dev_ctx), *in, pt_out_dtype, out); } diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index 25b3a446a0a32..64324d9772b47 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -46,11 +46,11 @@ class CastXPUKernel : public framework::OpKernel { out->mutable_data(dev_ctx.GetPlace(), static_cast(out_dtype)); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); - // call pten kernel + // call phi kernel phi::CastKernel( - static_cast::TYPE&>(dev_ctx), *in, pt_out_dtype, out); } diff --git a/paddle/fluid/operators/cholesky_solve_op.h b/paddle/fluid/operators/cholesky_solve_op.h index 86ed757465495..f25fbbb0c6980 100644 --- a/paddle/fluid/operators/cholesky_solve_op.h +++ b/paddle/fluid/operators/cholesky_solve_op.h @@ -203,7 +203,7 @@ class CholeskySolveGradKernel : public framework::OpKernel { commonterm_conj = helper.Transpose(commonterm_conj); phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), commonterm, commonterm_conj, -1, &commonterm); diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 9f27e2238c9c8..900fd4d8d292e 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -54,7 +54,7 @@ struct FillConstantVisitor { * = nullptr) const { #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(dev_ctx_.GetPlace())) { - Tensor tensor_tmp(framework::TransToPtenDataType(dtype_)); + Tensor tensor_tmp(framework::TransToPhiDataType(dtype_)); tensor_tmp.mutable_data({1}, context_.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value_)); @@ -194,7 +194,7 @@ class CoalesceTensorOpKernel : public framework::OpKernel { void *fused_tensor_ptr = fused_tensor->Resize(phi::make_ddim({static_cast(numel)})) .mutable_data(context.GetPlace(), - framework::TransToPtenDataType(dtype)); + framework::TransToPhiDataType(dtype)); VLOG(10) << "Fused tensor addr " << fused_tensor_ptr; // Init the continuous space diff --git a/paddle/fluid/operators/conj_op.h b/paddle/fluid/operators/conj_op.h index 2a815ef01e1f7..b2173d1b53104 100644 --- a/paddle/fluid/operators/conj_op.h +++ b/paddle/fluid/operators/conj_op.h @@ -37,7 +37,7 @@ class ConjKernel : public framework::OpKernel { // call new kernel phi::ConjKernel( - static_cast::TYPE&>(dev_ctx), *x, out); } diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index e8c28ebfeb008..7fd0a8eb16475 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -41,9 +41,9 @@ class DotKernel : public framework::OpKernel { out->mutable_data(x->place()); // call new kernel - phi::DotKernel::TYPE>( - static_cast::TYPE&>(dev_ctx), *x, *y, out); } @@ -66,7 +66,7 @@ class DotGradKernel : public framework::OpKernel { // call new kernel 
phi::DotGradKernel( - static_cast::TYPE&>(dev_ctx), *tensor_x, *tensor_y, *tensor_dout, tensor_dx, tensor_dy); } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index ae2e5b33b5f43..a995877778e47 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -55,7 +55,7 @@ class ElementwiseAddKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); phi::AddRawKernel( - static_cast::TYPE &>(dev_ctx), *x, *y, axis, z); #endif diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h index 1df43936920a9..c58a7f36548a5 100644 --- a/paddle/fluid/operators/elementwise/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_div_op.h @@ -63,11 +63,11 @@ class ElementwiseDivKernel : public framework::OpKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z); phi::DivideRawKernel( - static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu index a452c43ce2c19..45c87a27a180a 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.cu @@ -49,9 +49,9 @@ class ElementwiseMulKernel z_lod->mutable_data(ctx.GetPlace()); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x_lod); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y_lod); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z_lod); phi::MultiplyRawKernel(static_cast(cuda_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index 93713be051599..c81266d584468 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -122,11 +122,11 @@ class ElementwiseMulKernel : public framework::OpKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); - auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod); - auto pt_y = paddle::experimental::MakePtenDenseTensor(*y); - auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod); + auto pt_x = paddle::experimental::MakePhiDenseTensor(*x_lod); + auto pt_y = paddle::experimental::MakePhiDenseTensor(*y); + auto pt_z = paddle::experimental::MakePhiDenseTensor(*z_lod); phi::MultiplyRawKernel( - static_cast::TYPE&>(dev_ctx), *pt_x.get(), *pt_y.get(), axis, pt_z.get()); } else { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 52de5f77ed325..418779c32e8bc 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -31,18 +31,18 @@ void LaunchElementwiseCudaKernel( std::vector pt_outputs; // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // generated by MakePhiDenseTensor can be destroyed when exits loop. *_tmp // can be deleted // when DenseTensor support copy constructor. std::vector> pt_inputs_tmp; std::vector> pt_outputs_tmp; for (auto in : ins) { pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); + std::move(paddle::experimental::MakePhiDenseTensor(*in))); } for (auto out : *outs) { pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); + std::move(paddle::experimental::MakePhiDenseTensor(*out))); } for (int i = 0; i < pt_inputs_tmp.size(); i++) { pt_inputs.push_back(pt_inputs_tmp[i].get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 4a2d92a8c441a..7d7bb4f26fcf4 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/tensor.h" // only can include the headers in paddle/top/api dirs @@ -34,18 +34,18 @@ void LaunchSameDimsElementwiseCudaKernel( std::vector pt_outputs; // TODO(YuanRisheng) *_tmp for cache DenseTensor, because the temporary // DenseTensor obj - // generated by MakePtenDenseTensor can be destroyed when exits loop. *_tmp + // generated by MakePhiDenseTensor can be destroyed when exits loop. *_tmp // can be deleted // when DenseTensor support copy constructor. 
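The MakePhiDenseTensor calls introduced above create temporary phi::DenseTensor wrappers that must stay alive for the duration of the launch, which is why the launchers keep the *_tmp vectors around. A trimmed sketch of the input side of that loop follows; WrapInputs is a hypothetical helper, and the unique_ptr return type of MakePhiDenseTensor is assumed from the surrounding code.

#include <memory>
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/api/lib/utils/tensor_utils.h"

std::vector<const phi::DenseTensor*> WrapInputs(
    const std::vector<const paddle::framework::Tensor*>& ins,
    std::vector<std::unique_ptr<phi::DenseTensor>>* keep_alive) {
  std::vector<const phi::DenseTensor*> pt_inputs;
  for (auto* in : ins) {
    // Wrap the fluid Tensor without copying its allocation; the wrapper is
    // parked in keep_alive so it outlives the phi kernel call.
    keep_alive->emplace_back(paddle::experimental::MakePhiDenseTensor(*in));
    pt_inputs.push_back(keep_alive->back().get());
  }
  return pt_inputs;
}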
std::vector> pt_inputs_tmp; std::vector> pt_outputs_tmp; for (auto in : ins) { pt_inputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*in))); + std::move(paddle::experimental::MakePhiDenseTensor(*in))); } for (auto out : *outs) { pt_outputs_tmp.emplace_back( - std::move(paddle::experimental::MakePtenDenseTensor(*out))); + std::move(paddle::experimental::MakePhiDenseTensor(*out))); } for (int i = 0; i < pt_inputs_tmp.size(); i++) { pt_inputs.push_back(pt_inputs_tmp[i].get()); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.h b/paddle/fluid/operators/elementwise/elementwise_sub_op.h index 87b647f41352f..15c547b493ae0 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.h @@ -34,7 +34,7 @@ class ElementwiseSubKernel : public framework::OpKernel { auto& dev_ctx = ctx.device_context(); int axis = ctx.Attr("axis"); phi::SubtractRawKernel( - static_cast::TYPE&>(dev_ctx), *x, *y, axis, z); } @@ -56,7 +56,7 @@ class ElementwiseSubGradKernel : public ElemwiseGradKernel { auto& dev_ctx = ctx.device_context(); phi::SubtractGradKernel( - static_cast::TYPE&>(dev_ctx), *x, *y, *dout, axis, dx, dy); } @@ -86,7 +86,7 @@ class ElementwiseSubDoubleGradKernel : public framework::OpKernel { ddy_optional = *ddy; } phi::SubtractDoubleGradKernel( - static_cast::TYPE&>(dev_ctx), *y, ddx_optional, ddy_optional, *dout, axis, ddout); } diff --git a/paddle/fluid/operators/empty_op.h b/paddle/fluid/operators/empty_op.h index 42c951385a438..cb466fffcd7c7 100644 --- a/paddle/fluid/operators/empty_op.h +++ b/paddle/fluid/operators/empty_op.h @@ -39,7 +39,7 @@ class EmptyKernel : public framework::OpKernel { out_tensor->Resize(shape); out_tensor->mutable_data(context.GetPlace(), - framework::TransToPtenDataType(dtype)); + framework::TransToPhiDataType(dtype)); } }; diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index 2a914ff2ebd33..b02e60210c085 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -54,7 +54,7 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { std::isnan(value), false, platform::errors::InvalidArgument("The filled value is NaN.")); - Tensor tensor_tmp(framework::TransToPtenDataType(data_type)); + Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, context.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc index 896310cd0918b..ec4ba6e926c41 100644 --- a/paddle/fluid/operators/fill_any_like_op_xpu.cc +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -60,9 +60,9 @@ class FillAnyLikeXPUKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - // call pten kernel + // call phi kernel phi::FullLikeKernel( - static_cast::TYPE&>(dev_ctx), *x, value, phi::DataType::UNDEFINED, out); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h index 9d1d1eb7c6af5..31471c6b62268 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h @@ -63,7 +63,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { auto &dev_ctx = *pool.Get(platform::CPUPlace()); phi::funcs::SetConstant functor; 
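Nearly every fluid kernel touched in this patch follows the same bridging pattern: cast the fluid device context to its phi counterpart through ConvertToPhiContext, then call the raw phi kernel. A condensed version of the subtract path shown above, with the includes assumed from elementwise_sub_op.h:

template <typename DeviceContext, typename T>
void SubtractViaPhi(const paddle::framework::ExecutionContext& ctx) {
  auto* x = ctx.Input<paddle::framework::LoDTensor>("X");
  auto* y = ctx.Input<paddle::framework::LoDTensor>("Y");
  auto* z = ctx.Output<paddle::framework::LoDTensor>("Out");
  z->mutable_data<T>(ctx.GetPlace());
  int axis = ctx.Attr<int>("axis");

  auto& dev_ctx = ctx.device_context<DeviceContext>();
  // ConvertToPhiContext maps CPUDeviceContext -> phi::CPUContext,
  // CUDADeviceContext -> phi::GPUContext, XPUDeviceContext -> phi::XPUContext
  // (see phi_utils.h above).
  phi::SubtractRawKernel<T>(
      static_cast<const typename paddle::framework::ConvertToPhiContext<
          DeviceContext>::TYPE&>(dev_ctx),
      *x, *y, axis, z);
}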
out->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } @@ -72,7 +72,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { auto &dev_ctx = *pool.Get(ctx.GetPlace()); phi::funcs::SetConstant functor; out->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 9ce433a214dd5..5bba4da14aba8 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -72,13 +72,13 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { auto &dev_ctx = *pool.Get(platform::CPUPlace()); phi::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); } else { out->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); - Tensor tensor_tmp(framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); + Tensor tensor_tmp(framework::TransToPhiDataType(data_type)); tensor_tmp.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_tmp, value); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index eccc53d8766e2..d401b5b82f2b0 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -122,7 +122,7 @@ class FillConstantKernel : public framework::OpKernel { << ((data_type == framework::proto::VarType::BF16) ? 
"" : ""); tensor->mutable_data(platform::CPUPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CPUPlace()); functor(reinterpret_cast(dev_ctx), @@ -130,7 +130,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 1) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), @@ -142,7 +142,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 2) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(platform::CUDAPinnedPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CUDAPinnedPlace()); functor( @@ -155,7 +155,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 3) { #ifdef PADDLE_WITH_XPU tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); phi::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index eb684f818fb08..79018f2a97448 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -61,7 +61,7 @@ class FillConstantNPUKernel : public framework::OpKernel { out_var->mutable_data(shape, ctx.GetPlace()); if (data_type != framework::proto::VarType::BOOL) { - Tensor tensor_value(framework::TransToPtenDataType(data_type)); + Tensor tensor_value(framework::TransToPhiDataType(data_type)); tensor_value.mutable_data({1}, ctx.GetPlace()); FillNpuTensorWithConstant(&tensor_value, value); NpuOpRunner runner; diff --git a/paddle/fluid/operators/fill_op.h b/paddle/fluid/operators/fill_op.h index c202fa23ca891..c5cbffbf5c695 100644 --- a/paddle/fluid/operators/fill_op.h +++ b/paddle/fluid/operators/fill_op.h @@ -49,10 +49,10 @@ class FillKernel : public framework::OpKernel { out.Resize(phi::make_ddim(ctx.Attr>("shape"))); auto dtype = static_cast(ctx.Attr("dtype")); - auto pten_dtype = framework::TransToPtenDataType(dtype); + auto phi_dtype = framework::TransToPhiDataType(dtype); platform::CPUPlace cpu; auto force_cpu = ctx.Attr("force_cpu"); - out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), pten_dtype); + out.mutable_data(force_cpu ? cpu : ctx.GetPlace(), phi_dtype); framework::LoDTensor tensor; @@ -61,7 +61,7 @@ class FillKernel : public framework::OpKernel { } else { // Always make tensor in CPU memory. tensor.Resize(out.dims()); - tensor.mutable_data(cpu, pten_dtype); + tensor.mutable_data(cpu, phi_dtype); } framework::VisitDataType( diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 3605eabfc1d9b..5ef13b38c8a86 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/empty_kernel.h" @@ -132,9 +132,9 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { auto &dev_ctx = context.device_context(); // call new kernel - phi::FlattenKernel::TYPE>( - static_cast::TYPE &>(dev_ctx), *in, start_axis, stop_axis, out); } @@ -153,9 +153,9 @@ class FlattenContiguousRangeGradKernel : public framework::OpKernel { auto &dev_ctx = ctx.device_context(); // call new kernel - phi::FlattenGradKernel::TYPE>( - static_cast::TYPE &>(dev_ctx), *d_out, *xshape, d_x); } diff --git a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu index 962af435b2312..13f1c6808aef2 100644 --- a/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_embedding_eltwise_layernorm_op.cu @@ -34,9 +34,9 @@ class EmbeddingEltWiseLayerNormKernel : public framework::OpKernel { int input_num = static_cast(ids.size()); framework::Tensor in_ids_( - framework::TransToPtenDataType(framework::proto::VarType::INT64)), + framework::TransToPhiDataType(framework::proto::VarType::INT64)), in_embs_( - framework::TransToPtenDataType(framework::proto::VarType::INT64)); + framework::TransToPhiDataType(framework::proto::VarType::INT64)); framework::DDim in_dim{input_num}; int device_id; #ifdef PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index 49f4ff3107026..f323e2e041d99 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -88,8 +88,8 @@ void SetValueCompute(const framework::ExecutionContext& ctx, // set_value is what we want. 
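TransToPhiDataType, which replaces TransToPtenDataType throughout these hunks, maps the fluid proto VarType enum onto phi::DataType (FP32 -> FLOAT32, INT32 -> INT32, and so on). The temporary-tensor idiom used by the NPU/MLU kernels looks roughly like the sketch below; the convert_utils.h location of the helper is an assumption of this example.

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/tensor.h"

void MakeScalarTemp(const paddle::platform::Place& place) {
  namespace fw = paddle::framework;
  // Construct a one-element temporary with an explicit phi dtype.
  fw::Tensor tensor_tmp(fw::TransToPhiDataType(fw::proto::VarType::FP32));
  tensor_tmp.mutable_data<float>({1}, place);
  // ... fill tensor_tmp with the scalar value, e.g. via
  //     FillNpuTensorWithConstant on NPU as in fill_constant_op_npu.cc above.
}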
paddle::framework::TensorCopy(*in, place, out); - Tensor slice_tensor(framework::TransToPtenDataType(dtype)), - pad_tensor(framework::TransToPtenDataType(dtype)); + Tensor slice_tensor(framework::TransToPhiDataType(dtype)), + pad_tensor(framework::TransToPhiDataType(dtype)); slice_tensor.mutable_data(slice_dims, place); pad_tensor.mutable_data(in_dims, place); @@ -147,7 +147,7 @@ void SetValueCompute(const framework::ExecutionContext& ctx, ElementwiseComputeEx, DeviceContext, T>( ctx, &slice_tensor, value_tensor, -1, SubFunctor(), &slice_tensor); } else { - Tensor value_t(framework::TransToPtenDataType(dtype)); + Tensor value_t(framework::TransToPhiDataType(dtype)); auto value_dims = phi::make_ddim(shape); CheckIsDimsMatch(slice_dims_for_assign, value_dims); @@ -224,8 +224,8 @@ void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1, out->mutable_data(dev_ctx.GetPlace()); phi::AddRawKernel< - T, typename paddle::framework::ConvertToPtenContext::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), src1, src2, -1, out); } @@ -237,8 +237,8 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1, out->mutable_data(dev_ctx.GetPlace()); phi::SubtractRawKernel< - T, typename paddle::framework::ConvertToPtenContext::TYPE>( - static_cast::TYPE>( + static_cast::TYPE&>(dev_ctx), src1, src2, -1, out); } diff --git a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc index 1f87513bb4bea..2598d3b0277c9 100644 --- a/paddle/fluid/operators/metrics/accuracy_op_mlu.cc +++ b/paddle/fluid/operators/metrics/accuracy_op_mlu.cc @@ -35,8 +35,8 @@ class AccuracyMLUKernel : public framework::OpKernel { } // cast `indices` or `label` if their type is not INT32 - Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); - Tensor label_int32(framework::TransToPtenDataType(VT::INT32)); + Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); + Tensor label_int32(framework::TransToPhiDataType(VT::INT32)); auto indices_type = framework::TransToProtoVarType(indices->type()); if (indices_type != VT::INT32) { PADDLE_ENFORCE_EQ(MLUSupportsCast(indices_type, VT::INT32), true, @@ -78,7 +78,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // equal MLUCnnlTensorDesc indices_int32_desc(indices_int32); MLUCnnlTensorDesc label_int32_desc(label_int32); - Tensor equal_tensor(framework::TransToPtenDataType(VT::BOOL)); + Tensor equal_tensor(framework::TransToPhiDataType(VT::BOOL)); equal_tensor.Resize(indices->dims()); equal_tensor.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_tensor_desc(equal_tensor); @@ -88,7 +88,7 @@ class AccuracyMLUKernel : public framework::OpKernel { GetBasePtr(&equal_tensor)); // cast equal - Tensor equal_fp32(framework::TransToPtenDataType(VT::FP32)); + Tensor equal_fp32(framework::TransToPhiDataType(VT::FP32)); equal_fp32.Resize(indices->dims()); equal_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc equal_fp32_desc(equal_fp32); @@ -99,7 +99,7 @@ class AccuracyMLUKernel : public framework::OpKernel { // [correct] // reduce_max - Tensor correct_max(framework::TransToPtenDataType(VT::FP32)); + Tensor correct_max(framework::TransToPhiDataType(VT::FP32)); correct_max.Resize(phi::make_ddim({num_samples})); correct_max.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_max_desc(correct_max); @@ -112,7 +112,7 @@ class AccuracyMLUKernel : public framework::OpKernel { correct_max_desc.get(), GetBasePtr(&correct_max)); // reduce_sum - Tensor 
correct_sum(framework::TransToPtenDataType(VT::FP32)); + Tensor correct_sum(framework::TransToPhiDataType(VT::FP32)); correct_sum.Resize(correct->dims()); correct_sum.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc correct_sum_desc(correct_sum); @@ -138,7 +138,7 @@ class AccuracyMLUKernel : public framework::OpKernel { MLUCnnl::Fill(ctx, num_samples, total_desc.get(), GetBasePtr(total)); // use `total` of type `float32` for calculating accuracy - Tensor total_fp32(framework::TransToPtenDataType(VT::FP32)); + Tensor total_fp32(framework::TransToPhiDataType(VT::FP32)); total_fp32.Resize(total->dims()); total_fp32.mutable_data(ctx.GetPlace()); MLUCnnlTensorDesc total_fp32_desc(total_fp32); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index 056e0690c01fd..2cbecba9fa081 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -85,7 +85,7 @@ inline cnnlDataType_t ToCnnlDataType( inline cnnlDataType_t ToCnnlDataType( const paddle::framework::proto::VarType::Type& type) { - return ToCnnlDataType(framework::TransToPtenDataType(type)); + return ToCnnlDataType(framework::TransToPhiDataType(type)); } template diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index eb39f069e56b7..65cca94814e88 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -257,12 +257,12 @@ class ReduceKernel : public framework::OpKernel { std::vector tmp_dims(dims.begin(), dims.end()); // call new kernel - phi::Reduce::TYPE, - T, Functor>( - static_cast::TYPE, T, + Functor>( + static_cast::TYPE&>(dev_ctx), *input, reduce_all, tmp_dims, keep_dim, - framework::TransToPtenDataType(cast_out_dtype), output); + framework::TransToPhiDataType(cast_out_dtype), output); } }; template @@ -684,7 +684,7 @@ class ReduceCudaKernel : public framework::OpKernel { const Tensor* input = context.Input("X"); Tensor* output = context.Output("Out"); auto out_dtype = context.Attr("out_dtype"); - auto pt_out_dtype = paddle::framework::TransToPtenDataType( + auto pt_out_dtype = paddle::framework::TransToPhiDataType( static_cast(out_dtype)); std::vector dims = context.Attr>("dim"); @@ -714,7 +714,7 @@ class ReduceCudaGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); auto* d_x = context.Output(framework::GradVarName("X")); auto out_dtype = context.Attr("in_dtype"); - auto pt_out_dtype = framework::TransToPtenDataType( + auto pt_out_dtype = framework::TransToPhiDataType( static_cast(out_dtype)); // get reduce_dim and reduce_num for reduce_mean_grad int dim_size = in_x->dims().size(); @@ -735,8 +735,8 @@ class ReduceCudaGradKernel : public framework::OpKernel { } else { d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype()); } - auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out); - auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x); + auto pt_d_out = paddle::experimental::MakePhiDenseTensor(new_d_out); + auto pt_d_x = paddle::experimental::MakePhiDenseTensor(*d_x); if (out_dtype <= 0) { pt_out_dtype = d_out->dtype(); } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 0e74a23523b7d..8d99a60b12967 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" // only can include the headers in paddle/phi/api dirs #include "paddle/phi/api/lib/utils/tensor_utils.h" diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index d6e8f3e5aa108..40f5699a29b35 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -42,9 +42,9 @@ class ScaleXPUKernel : public framework::OpKernel { framework::GetMutableLoDTensorOrSelectedRowsValueFromVar(out_var); out->mutable_data(in->place()); auto& dev_ctx = ctx.template device_context(); - // call pten kernel + // call phi kernel phi::ScaleKernel( - static_cast::TYPE&>(dev_ctx), *in, scale, bias, bias_after_scale, out); } diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc index a51f68530caf8..1cd6f8b7698b9 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_mlu.cc @@ -87,7 +87,7 @@ class SoftmaxWithCrossEntropyMLUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "If soft_label=False, axis must be -1 or" " can be regard as last dimention in mlu kernel.")); - framework::Tensor labels_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor labels_int32(framework::TransToPhiDataType(VT::INT32)); labels_int32.Resize(labels->dims()); labels_int32.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/top_k_op_mlu.cc b/paddle/fluid/operators/top_k_op_mlu.cc index a9f835f6fe2c2..102902bdaaaaf 100644 --- a/paddle/fluid/operators/top_k_op_mlu.cc +++ b/paddle/fluid/operators/top_k_op_mlu.cc @@ -47,7 +47,7 @@ class TopkMLUKernel : public framework::OpKernel { const bool sorted = true; const int axis = -1; // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/top_k_v2_op_mlu.cc b/paddle/fluid/operators/top_k_v2_op_mlu.cc index 7bada0179a1c5..5b8a6b3e75449 100644 --- a/paddle/fluid/operators/top_k_v2_op_mlu.cc +++ b/paddle/fluid/operators/top_k_v2_op_mlu.cc @@ -55,7 +55,7 @@ class TopkV2MLUKernel : public framework::OpKernel { indices->mutable_data(place); // cnnl only support int32/int16 type of indices - framework::Tensor indices_int32(framework::TransToPtenDataType(VT::INT32)); + framework::Tensor indices_int32(framework::TransToPhiDataType(VT::INT32)); indices_int32.Resize(indices->dims()); indices_int32.mutable_data(place); diff --git a/paddle/fluid/operators/uniform_random_inplace_op.cu b/paddle/fluid/operators/uniform_random_inplace_op.cu index 1c7b9a27f8688..b8d8467b7eba9 100644 --- a/paddle/fluid/operators/uniform_random_inplace_op.cu +++ b/paddle/fluid/operators/uniform_random_inplace_op.cu @@ -36,7 +36,7 @@ class GPUUniformRandomInplaceGradKernel : public framework::OpKernel { ctx.template device_context(); float value = static_cast(0.0f); phi::FullKernel( - static_cast::TYPE&>(dev_cxt), dims, value, phi::DataType::UNDEFINED, dx); } diff --git a/paddle/fluid/platform/device/ipu/ipu_executor.cc b/paddle/fluid/platform/device/ipu/ipu_executor.cc index 91ab7f3f4f052..c124d58957fe6 
100644 --- a/paddle/fluid/platform/device/ipu/ipu_executor.cc +++ b/paddle/fluid/platform/device/ipu/ipu_executor.cc @@ -113,7 +113,7 @@ void Executor::Run(const std::vector &inputs, auto fetch_dtype = fetch_info.dataType(); auto paddle_type = PopartType2VarType(fetch_dtype); tensor->mutable_data(ctx.GetPlace(), - framework::TransToPtenDataType(paddle_type)); + framework::TransToPhiDataType(paddle_type)); anchor_wrappers.emplace(tensor_id, PaddleIArray(tensor)); popart_anchors.emplace(tensor_id, anchor_wrappers.at(tensor_id)); } diff --git a/paddle/fluid/platform/device/npu/npu_op_runner.cc b/paddle/fluid/platform/device/npu/npu_op_runner.cc index 90c0851d79d80..d45492391dc88 100644 --- a/paddle/fluid/platform/device/npu/npu_op_runner.cc +++ b/paddle/fluid/platform/device/npu/npu_op_runner.cc @@ -467,7 +467,7 @@ void NpuOpRunner::TypeAdapter( } else { tmp_inputs[i].Resize(inputs[i].dims()); tmp_inputs[i].mutable_data(dev_ctx.GetPlace(), - framework::TransToPtenDataType(input_type[i])); + framework::TransToPhiDataType(input_type[i])); const auto &cast_runner = NpuOpRunner( "Cast", {inputs[i]}, {tmp_inputs[i]}, @@ -484,7 +484,7 @@ void NpuOpRunner::TypeAdapter( } else { tmp_outputs[i].Resize(outputs[i].dims()); tmp_outputs[i].mutable_data( - dev_ctx.GetPlace(), framework::TransToPtenDataType(output_type[i])); + dev_ctx.GetPlace(), framework::TransToPhiDataType(output_type[i])); } } diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 285c6a4c13053..01de7349f4823 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -1056,7 +1056,7 @@ class ReorderMKLDNNHandler { platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims_, dtype_dst_, fmt); auto dst_data = output->mutable_data( - place, framework::TransToPtenDataType(vtype_dst_), dst_md.get_size()); + place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); return std::make_shared(dst_md, engine_, dst_data); } @@ -1065,7 +1065,7 @@ class ReorderMKLDNNHandler { const MKLDNNMemoryFormat& fmt, platform::Place place) { auto dst_md = platform::MKLDNNMemDesc(dims, dtype_dst_, fmt); auto dst_data = output->mutable_data( - place, framework::TransToPtenDataType(vtype_dst_), dst_md.get_size()); + place, framework::TransToPhiDataType(vtype_dst_), dst_md.get_size()); return std::make_shared(dst_md, engine_, dst_data); } diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 49690d1c66be7..6f714a677033b 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -59,7 +59,7 @@ struct Transform { BinaryOperation op); }; -// NOTE: After the pten kernel is migrated, it needs to be deleted. +// NOTE: After the phi kernel is migrated, it needs to be deleted. 
template <> struct Transform { template diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc index d9a2dcb686909..1052f93d32ec3 100644 --- a/paddle/fluid/pybind/eager.cc +++ b/paddle/fluid/pybind/eager.cc @@ -75,7 +75,7 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name, std::shared_ptr dense_tensor = std::make_shared( phi::make_intrusive(place), - phi::DenseTensorMeta(paddle::framework::TransToPtenDataType(dtype), + phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype), ddims)); if (phi::product(ddims) > 0) { dense_tensor->mutable_data(place); @@ -133,7 +133,7 @@ void InitTensorWithTensor(TensorObject* self, VLOG(4) << "Same place, do ShareDataWith"; } else { self->tensor.set_impl( - src.copy_to(phi::TransToPtenBackend(place), true).impl()); + src.copy_to(phi::TransToPhiBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } if (src.get_autograd_meta()) { @@ -157,7 +157,7 @@ void InitTensorWithFrameworkTensor(TensorObject* self, auto temp = paddle::experimental::Tensor(std::make_shared(src)); self->tensor.set_impl( - temp.copy_to(phi::TransToPtenBackend(place), true).impl()); + temp.copy_to(phi::TransToPhiBackend(place), true).impl()); VLOG(4) << "Different place, do TensorCopy"; } egr::EagerUtils::autograd_meta(&(self->tensor))->SetPersistable(false); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index b825e9265a8cd..0b04dc7347ce7 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -135,7 +135,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self, PyObject* args, auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 2), 2); bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 3), 3); - dst = src.copy_to(phi::TransToPtenBackend(place), blocking); + dst = src.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&dst)->SetStopGradient( egr::EagerUtils::autograd_meta(&(src))->StopGradient()); egr::EagerUtils::autograd_meta(&dst)->SetPersistable( diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 221d4d53d0663..f11a2ab2517fb 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -191,7 +191,7 @@ static PyObject* tensor_method__copy_to(TensorObject* self, PyObject* args, bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 0), 0); auto place = CastPyArg2Place(PyTuple_GET_ITEM(args, 1), 1); auto cp_tensor = - self->tensor.copy_to(phi::TransToPtenBackend(place), blocking); + self->tensor.copy_to(phi::TransToPhiBackend(place), blocking); egr::EagerUtils::autograd_meta(&cp_tensor)->SetStopGradient(true); egr::EagerUtils::autograd_meta(&cp_tensor) ->SetPersistable( diff --git a/paddle/fluid/pybind/eager_op_function_generator.cc b/paddle/fluid/pybind/eager_op_function_generator.cc index 4fe47d5a8427d..c15c171799f44 100644 --- a/paddle/fluid/pybind/eager_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_op_function_generator.cc @@ -32,7 +32,7 @@ #endif #include "paddle/fluid/pybind/op_function_generator.h" -// pten +// phi #include "paddle/phi/kernels/declarations.h" // clang-format off @@ -365,9 +365,9 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip ooerator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. 
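The eager-mode bindings above route cross-device copies through the phi backend enum. Stripped of the Python plumbing, the call reduces to something like the sketch below; CopyToPlace is a hypothetical wrapper around the copy_to call taken from the hunks above.

paddle::experimental::Tensor CopyToPlace(
    const paddle::experimental::Tensor& src,
    const paddle::platform::Place& place, bool blocking) {
  // TransToPhiBackend selects phi::Backend::CPU / GPU / XPU from the place.
  return src.copy_to(phi::TransToPhiBackend(place), blocking);
}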
- // if the pten lib contains op kernel, we still generate ops method + // if the phi lib contains op kernel, we still generate ops method if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { continue; } std::string func_name = "eager_api_" + op_type; diff --git a/paddle/fluid/pybind/kernel_signature_generator.cc b/paddle/fluid/pybind/kernel_signature_generator.cc index 14e4fac7cdd95..8283a249ded4c 100644 --- a/paddle/fluid/pybind/kernel_signature_generator.cc +++ b/paddle/fluid/pybind/kernel_signature_generator.cc @@ -15,7 +15,7 @@ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/pten_utils.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/phi/core/compat/op_utils.h" #include "paddle/phi/core/kernel_factory.h" diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index cbbe56985b2ad..9d5bcfac494cb 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -32,7 +32,7 @@ #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif -// pten +// phi #include "paddle/phi/kernels/declarations.h" // NOTE(pangyoki): Inplace OP with duplicable input. @@ -400,9 +400,9 @@ GenerateOpFunctions() { auto& op_type = op_proto->type(); // Skip operator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. - // if the pten lib contains op kernel, we still generate ops method + // if the phi lib contains op kernel, we still generate ops method if (!all_kernels.count(op_type) && - !phi::KernelFactory::Instance().HasCompatiblePtenKernel(op_type)) { + !phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type)) { continue; } diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 1ea9c7c65d5f5..6e553ad2e60e2 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -50,8 +50,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/parallel_executor.h" +#include "paddle/fluid/framework/phi_utils.h" #include "paddle/fluid/framework/prune.h" -#include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/save_load_util.h" #include "paddle/fluid/framework/scope_pool.h" @@ -464,7 +464,7 @@ static void inline CreateVariableIfNotExit( tensor_temp->Resize(phi::make_ddim(var_desc.GetShape())); tensor_temp->mutable_data( exe->GetPlace(), - framework::TransToPtenDataType(var_desc.GetDataType())); + framework::TransToPhiDataType(var_desc.GetDataType())); } } } else { @@ -671,60 +671,60 @@ PYBIND11_MODULE(core_noavx, m) { m.def("_get_use_default_grad_op_desc_maker_ops", [] { return OpInfoMap::Instance().GetUseDefaultGradOpDescMakerOps(); }); - m.def( - "_get_all_register_op_kernels", - [](const std::string &lib) { - std::unordered_map> - all_kernels_info; - if (lib == "fluid" || lib == "all") { - auto &all_kernels = - paddle::framework::OperatorWithKernel::AllOpKernels(); - - for (auto &kernel_pair : all_kernels) { - auto op_type = kernel_pair.first; - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - paddle::framework::OpKernelType kernel_type = info_pair.first; - kernel_types.emplace_back( - paddle::framework::KernelTypeToString(kernel_type)); + m.def("_get_all_register_op_kernels", + [](const std::string &lib) { + std::unordered_map> + all_kernels_info; + if (lib == "fluid" || lib == "all") { + auto &all_kernels = + paddle::framework::OperatorWithKernel::AllOpKernels(); + + for (auto &kernel_pair : all_kernels) { + auto op_type = kernel_pair.first; + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + paddle::framework::OpKernelType kernel_type = info_pair.first; + kernel_types.emplace_back( + paddle::framework::KernelTypeToString(kernel_type)); + } + all_kernels_info.emplace(op_type, kernel_types); } - all_kernels_info.emplace(op_type, kernel_types); } - } - if (lib == "pten" || lib == "all") { - auto pten_kernels = phi::KernelFactory::Instance().kernels(); - for (auto &kernel_pair : pten_kernels) { - auto op_type = phi::TransToFluidOpName(kernel_pair.first); - std::vector kernel_types; - for (auto &info_pair : kernel_pair.second) { - framework::OpKernelType kernel_type = - framework::TransPtenKernelKeyToOpKernelType(info_pair.first); - auto kernel_type_str = framework::KernelTypeToString(kernel_type); - if (all_kernels_info.count(op_type)) { - if (std::find(all_kernels_info[op_type].begin(), - all_kernels_info[op_type].end(), - kernel_type_str) == - all_kernels_info[op_type].end()) { - all_kernels_info[op_type].emplace_back(kernel_type_str); + if (lib == "phi" || lib == "all") { + auto phi_kernels = phi::KernelFactory::Instance().kernels(); + for (auto &kernel_pair : phi_kernels) { + auto op_type = phi::TransToFluidOpName(kernel_pair.first); + std::vector kernel_types; + for (auto &info_pair : kernel_pair.second) { + framework::OpKernelType kernel_type = + framework::TransPhiKernelKeyToOpKernelType(info_pair.first); + auto kernel_type_str = + framework::KernelTypeToString(kernel_type); + if (all_kernels_info.count(op_type)) { + if (std::find(all_kernels_info[op_type].begin(), + all_kernels_info[op_type].end(), + kernel_type_str) == + all_kernels_info[op_type].end()) { + all_kernels_info[op_type].emplace_back(kernel_type_str); + } + } else { + 
kernel_types.emplace_back(kernel_type_str); } - } else { - kernel_types.emplace_back(kernel_type_str); } - } - if (!kernel_types.empty()) { - all_kernels_info.emplace(op_type, kernel_types); + if (!kernel_types.empty()) { + all_kernels_info.emplace(op_type, kernel_types); + } } } - } - return all_kernels_info; - }, - py::arg("lib") = "all", - R"DOC( + return all_kernels_info; + }, + py::arg("lib") = "all", + R"DOC( Return the registered kernels in paddle. Args: - lib[string]: the libarary, could be 'pten', 'fluid' and 'all'. + lib[string]: the libarary, could be 'phi', 'fluid' and 'all'. )DOC"); // NOTE(zjl): ctest would load environment variables at the beginning even @@ -823,39 +823,39 @@ PYBIND11_MODULE(core_noavx, m) { .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::XPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CUDAPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::CUDAPinnedPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::MLUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_clear", &framework::Tensor::clear) .def("_mutable_data", [](framework::Tensor &self, paddle::platform::NPUPlace &place, paddle::framework::proto::VarType::Type type) { - return reinterpret_cast(self.mutable_data( - place, framework::TransToPtenDataType(type))); + return reinterpret_cast( + self.mutable_data(place, framework::TransToPhiDataType(type))); }) .def("_copy_from", &TensorCopyFrom, py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 49bacc1cd6d85..e7abd64ec4439 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -324,7 +324,7 @@ void SetTensorFromPyArrayT( if (zero_copy) { auto holder = std::make_shared>(array); auto type = framework::ToDataType(std::type_index(typeid(T))); - self->ResetHolderWithType(holder, framework::TransToPtenDataType(type)); + self->ResetHolderWithType(holder, framework::TransToPhiDataType(type)); } else { auto dst = self->mutable_data(place); std::memcpy(dst, array.data(), array.nbytes()); @@ -348,7 +348,7 @@ void SetTensorFromPyArrayT( if (zero_copy) { auto holder = std::make_shared>(array); auto type = 
framework::ToDataType(std::type_index(typeid(T))); - self->ResetHolderWithType(holder, framework::TransToPtenDataType(type)); + self->ResetHolderWithType(holder, framework::TransToPhiDataType(type)); } else { // IPU does not store Tensor data, Tensor will be created on CPU if (!self->initialized()) { @@ -518,7 +518,7 @@ void SetUVATensorFromPyArray( cuda_device_pointer, need_allocate_size, platform::CUDAPlace(device_id)); self_tensor->ResetHolderWithType(holder, - framework::TransToPtenDataType(data_type)); + framework::TransToPhiDataType(data_type)); #endif } diff --git a/paddle/phi/api/all.h b/paddle/phi/api/all.h index 748ed11058af6..154b84670aaf9 100644 --- a/paddle/phi/api/all.h +++ b/paddle/phi/api/all.h @@ -24,12 +24,12 @@ limitations under the License. */ #endif #endif -// new pten apis +// new phi apis #include "paddle/phi/api/include/api.h" #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/include/tensor.h" -// pten common headers +// phi common headers #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" #include "paddle/phi/common/layout.h" diff --git a/paddle/phi/api/include/tensor.h b/paddle/phi/api/include/tensor.h index db0c28198e80a..c268742fa567b 100644 --- a/paddle/phi/api/include/tensor.h +++ b/paddle/phi/api/include/tensor.h @@ -49,8 +49,6 @@ namespace paddle { namespace experimental { -class CompatiblePTenTensorUtils; - class AbstractAutogradMeta { public: // No AbstractAutogradMeta should be created @@ -59,7 +57,7 @@ class AbstractAutogradMeta { /** * Tensor is the API description of the basic data structure in the - * [ "Paddle Tensor Operation (pten)" Library ]. + * [ "Paddle Tensor Operation (phi)" Library ]. * * It is not limited to a simple n-dimensional array. * It contains a smart pointer to `TensorImpl`. The data description contained @@ -366,7 +364,7 @@ class PADDLE_API Tensor final { /* Part 5: Data Transform methods */ /* Alert!!!!: All copy method can only deep copy impl, autograd info only be * copied */ - /* out of pten */ + /* out of phi */ /** * @brief Copy the current Tensor data to the specified device * and return the new Tensor. It's usually used to set the input tensor data. @@ -476,9 +474,6 @@ class PADDLE_API Tensor final { /* Part 9: Auto generated Tensor methods */ - private: - friend class CompatiblePTenTensorUtils; - private: /** * [ Why use abstract TensorImpl interface here? ] diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 89a51dde46312..c7400b93fcdc1 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -58,7 +58,7 @@ Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) { auto* kernel_fn = kernel.GetVariadicKernelFn(); (*kernel_fn)( - *dev_ctx, *dense_x, phi::TransToPtenPlace(backend), blocking, kernel_out); + *dev_ctx, *dense_x, phi::TransToPhiPlace(backend), blocking, kernel_out); return out; } diff --git a/paddle/phi/api/lib/api_registry.h b/paddle/phi/api/lib/api_registry.h index 3783620ea449b..212a2f96452f6 100644 --- a/paddle/phi/api/lib/api_registry.h +++ b/paddle/phi/api/lib/api_registry.h @@ -27,7 +27,7 @@ namespace experimental { #endif /** - * Now there is no module to call pten's API. When compiling, the function + * Now there is no module to call phi's API. When compiling, the function * implementation will be optimized. Therefore, the symbol will be exposed * manually for the time being. 
* @@ -41,7 +41,7 @@ namespace experimental { #define PD_DECLARE_API(name) \ extern PADDLE_API int RegisterSymbolsFor##name(); \ - UNUSED static int use_pten_api_##name = RegisterSymbolsFor##name() + UNUSED static int use_phi_api_##name = RegisterSymbolsFor##name() } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/api_utils.h b/paddle/phi/api/lib/api_utils.h index d44dde3b74dd2..6c1fa97c0f52a 100644 --- a/paddle/phi/api/lib/api_utils.h +++ b/paddle/phi/api/lib/api_utils.h @@ -106,7 +106,7 @@ inline paddle::optional MakeMetaTensor( inline phi::DenseTensor* SetKernelOutput(Backend backend, Tensor* out) { if (!out->initialized()) { auto dense_tensor = std::make_shared( - phi::make_intrusive(phi::TransToPtenPlace(backend)), + phi::make_intrusive(phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); out->set_impl(dense_tensor); return dense_tensor.get(); @@ -120,7 +120,7 @@ inline std::vector SetKernelOutput( std::vector results(out_size); for (size_t i = 0; i < out_size; ++i) { auto tensor_ptr = std::make_shared( - phi::make_intrusive(phi::TransToPtenPlace(backend)), + phi::make_intrusive(phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); results[i] = tensor_ptr.get(); out->emplace_back(); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 2074ddd8a9127..ae67e2ebb35cc 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -38,7 +38,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, const TransformFlag& transform_flag) { bool ret = transform_flag.need_trans_backend() && target != Backend::ALL_BACKEND && - !platform::is_same_place(input, phi::TransToPtenPlace(target)); + !platform::is_same_place(input, phi::TransToPhiPlace(target)); return ret; } @@ -168,10 +168,10 @@ phi::DenseTensor TransformData(const phi::DenseTensor& tensor, out.place(), target_args_def.backend, transform_flag)) { phi::DenseTensor result( phi::make_intrusive( - phi::TransToPtenPlace(target_args_def.backend)), + phi::TransToPhiPlace(target_args_def.backend)), {out.dtype(), out.dims(), out.layout()}); framework::TransDataDevice( - out, phi::TransToPtenPlace(target_args_def.backend), &result); + out, phi::TransToPhiPlace(target_args_def.backend), &result); out = result; } return out; diff --git a/paddle/phi/api/lib/kernel_dispatch.cc b/paddle/phi/api/lib/kernel_dispatch.cc index 5251473f3b5c9..0e3ca1af4967c 100644 --- a/paddle/phi/api/lib/kernel_dispatch.cc +++ b/paddle/phi/api/lib/kernel_dispatch.cc @@ -21,7 +21,7 @@ namespace experimental { namespace detail { BackendSet GetTensorBackendSet(const Tensor& t) { - BackendSet backend_set(phi::TransToPtenBackend(t.inner_place())); + BackendSet backend_set(phi::TransToPhiBackend(t.inner_place())); switch (t.layout()) { case DataLayout::MKLDNN: backend_set = backend_set | BackendSet(Backend::MKLDNN); @@ -53,7 +53,7 @@ std::size_t CountLeadingZeros(uint64_t val) { phi::DeviceContext* GetDeviceContextByBackend(phi::Backend backend) { auto& pool = paddle::platform::DeviceContextPool::Instance(); - return pool.Get(phi::TransToPtenPlace(backend)); + return pool.Get(phi::TransToPhiPlace(backend)); } DataType ParseDataType(DataType dtype) { return dtype; } @@ -83,7 +83,7 @@ DataType ParseDataTypeWithInputOrder(DataType dtype, const Tensor& tensor) { Backend ParseBackend(Backend backend) { return backend; } Backend ParseBackend(const Tensor& tensor) { - return phi::TransToPtenBackend(tensor.inner_place()); + return 
phi::TransToPhiBackend(tensor.inner_place()); } Backend ParseBackendWithInputOrder(Backend backend, const Tensor& tensor) { diff --git a/paddle/phi/api/lib/sparse_api.cc b/paddle/phi/api/lib/sparse_api.cc index c0c10e0ac6a48..9e1f59c0aa743 100644 --- a/paddle/phi/api/lib/sparse_api.cc +++ b/paddle/phi/api/lib/sparse_api.cc @@ -86,11 +86,11 @@ PADDLE_API Tensor to_sparse_coo(const Tensor& x, // create empty SparseCooTensor phi::DenseTensor non_zero_indices( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(indices_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(elements_meta)); auto coo = std::make_shared( non_zero_indices, non_zero_elements, x.dims()); @@ -148,15 +148,15 @@ PADDLE_API Tensor to_sparse_csr(const Tensor& x, Backend backend) { // create empty SparseCooTensor phi::DenseTensor non_zero_crows( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(crows_meta)); phi::DenseTensor non_zero_cols( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(cols_meta)); phi::DenseTensor non_zero_elements( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(elements_meta)); auto csr = std::make_shared( non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); @@ -211,7 +211,7 @@ PADDLE_API Tensor to_dense(const Tensor& x, Backend backend) { // create empty SparseCooTensor auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), std::move(dense_meta)); kernel_context.EmplaceBackOutput(dense_out.get()); diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index ada08019f678a..311dd0fc30941 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -33,7 +33,7 @@ limitations under the License. */ * * We hope to organize the basic implementation of Tensor and the logic related * to Tensor computation into an independent library, which we call - * [Tensor Operation Library, pten], so we extract or rewrite the original + * [Tensor Operation Library, phi], so we extract or rewrite the original * Kernels. 
* * In the future, the training library, inference library and custom operators diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 7308a9d752c7a..aefa26952d1e5 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -88,7 +88,7 @@ void Tensor::copy_(const Tensor &src, bool blocking) { src.name())); } auto copy_tensor = - src.copy_to(phi::TransToPtenBackend(src.inner_place()), blocking); + src.copy_to(phi::TransToPhiBackend(src.inner_place()), blocking); set_impl(copy_tensor.impl()); } diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index fc56d201fe3cc..31325e22afae3 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -31,13 +31,13 @@ void SetLoD(DstLoD* dst, const SrcLoD& src) { } } -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePhiDenseTensor( const paddle::framework::Tensor& src) { return std::make_unique(src); } -phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); +phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable) { + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.place(), expected_place)) { @@ -55,21 +55,21 @@ phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable) { } } -phi::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src) { +phi::ScalarArray MakePhiScalarArray(const paddle::framework::Tensor& src) { return {src}; } -phi::ScalarArray MakePtenScalarArrayFromVar( +phi::ScalarArray MakePhiScalarArrayFromVar( const framework::Variable& variable) { - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); if (variable.IsType()) { const auto& tensor = variable.Get(); if (!platform::is_same_place(tensor.place(), expected_place)) { framework::LoDTensor tmp_tensor; framework::TensorCopySync(tensor, expected_place, &tmp_tensor); - return MakePtenScalarArray(tmp_tensor); + return MakePhiScalarArray(tmp_tensor); } else { - return MakePtenScalarArray(tensor); + return MakePhiScalarArray(tensor); } } else { PADDLE_THROW(platform::errors::Unimplemented( @@ -80,12 +80,12 @@ phi::ScalarArray MakePtenScalarArrayFromVar( } // TODO(chentianyu03): Inplace with ScalarArray constructor -phi::ScalarArray MakePtenScalarArrayFromVarList( +phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list) { if (variable_list.size() == 0) { return phi::ScalarArray(); } - auto expected_place = phi::TransToPtenPlace(phi::Backend::CPU); + auto expected_place = phi::TransToPhiPlace(phi::Backend::CPU); std::vector vector_data; vector_data.reserve(variable_list.size()); diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 51aca6a52b41c..8b30d5421ab94 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -30,17 +30,16 @@ limitations under the License. 
*/ namespace paddle { namespace experimental { -std::unique_ptr MakePtenDenseTensor( +std::unique_ptr MakePhiDenseTensor( const paddle::framework::Tensor& src); -phi::ScalarArray MakePtenScalarArray(const paddle::framework::Tensor& src); +phi::ScalarArray MakePhiScalarArray(const paddle::framework::Tensor& src); -phi::Scalar MakePtenScalarFromVar(const framework::Variable& variable); +phi::Scalar MakePhiScalarFromVar(const framework::Variable& variable); -phi::ScalarArray MakePtenScalarArrayFromVar( - const framework::Variable& variable); +phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); -phi::ScalarArray MakePtenScalarArrayFromVarList( +phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list); void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, diff --git a/paddle/phi/backends/all_context.h b/paddle/phi/backends/all_context.h index 3fe03905e42dd..57e6f084fd4c9 100644 --- a/paddle/phi/backends/all_context.h +++ b/paddle/phi/backends/all_context.h @@ -18,7 +18,7 @@ limitations under the License. */ // In order to avoid including the header files of each backend in turn, // add this header file // Note: Limit the entry of DeviceContext to backends to avoid multiple include -// path replacement after implementing pten DeviceContext +// path replacement after implementing phi DeviceContext #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/backends/custom/custom_context.h" diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index e67df65850f15..aa14c2a8e3862 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/forwards.h" #include "paddle/phi/core/device_context.h" -// TODO(wilber): Do we need to use place in pten kernel? +// TODO(wilber): Do we need to use place in phi kernel? #include "paddle/phi/common/place.h" namespace phi { diff --git a/paddle/phi/backends/gpu/gpu_context.cc b/paddle/phi/backends/gpu/gpu_context.cc index 28057abed542a..dbcc1660c6472 100644 --- a/paddle/phi/backends/gpu/gpu_context.cc +++ b/paddle/phi/backends/gpu/gpu_context.cc @@ -49,7 +49,7 @@ limitations under the License. */ // without eigen. #include "unsupported/Eigen/CXX11/Tensor" -// TODO(pten): remove fluid header. +// TODO(phi): remove fluid header. #include "paddle/fluid/platform/enforce.h" namespace phi { diff --git a/paddle/phi/backends/gpu/rocm/rocm_info.cc b/paddle/phi/backends/gpu/rocm/rocm_info.cc index 11dd4f7248782..23e58d34b2572 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_info.cc +++ b/paddle/phi/backends/gpu/rocm/rocm_info.cc @@ -15,7 +15,7 @@ #include #include "paddle/phi/backends/gpu/gpu_info.h" -// TODO(pten): remove fluid headers. +// TODO(phi): remove fluid headers. #include "paddle/fluid/platform/enforce.h" static std::once_flag g_device_props_size_init_flag; diff --git a/paddle/phi/backends/xpu/xpu_info.cc b/paddle/phi/backends/xpu/xpu_info.cc index 96e95df7a9886..d454fc0734c66 100644 --- a/paddle/phi/backends/xpu/xpu_info.cc +++ b/paddle/phi/backends/xpu/xpu_info.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/phi/backends/xpu/xpu_header.h" #include "paddle/phi/common/place.h" -// TODO(wilber): The pten computing library requires a component to manage +// TODO(wilber): The phi computing library requires a component to manage // flags. 
#include "paddle/fluid/platform/flags.h" diff --git a/paddle/phi/common/layout.h b/paddle/phi/common/layout.h index 30832bd60bc0e..648fc02d054cb 100644 --- a/paddle/phi/common/layout.h +++ b/paddle/phi/common/layout.h @@ -32,7 +32,7 @@ enum class DataLayout { NUM_DATA_LAYOUTS, // See Note [ Why we need ALL in basic kernel key member? ] ALL_LAYOUT = UNDEFINED, - // Note: Unify pten DataLayout and fluid::framework::DataLayout, + // Note: Unify phi DataLayout and fluid::framework::DataLayout, // for compatible with fluid DataLayout, here need prefix `k` // Note: The original `kAnyLayout (enum value 2)` is a strange design. // `kAnyLayout` originally cannot represent any kind of Layout, diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index bc179e8fed74e..644bf3679af2a 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -43,7 +43,7 @@ const char *AllocationTypeStr(AllocationType type) { case AllocationType::MLU: return "mlu"; default: - PD_THROW("Invalid pten device type."); + PD_THROW("Invalid phi device type."); return {}; } } diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 39cb3fb569267..af29b3bab5c3c 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -79,7 +79,7 @@ class ArgumentMappingContext { virtual bool HasOutput(const std::string& name) const = 0; virtual bool HasAttr(const std::string& name) const = 0; - // now we can't use Attribute here, it will cause pten relay on + // now we can't use Attribute here, it will cause phi relay on // boost::variant and BlockDesc virtual paddle::any Attr(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index f7dab1d34c980..3b7a733ede904 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -25,7 +25,7 @@ limitations under the License. */ namespace phi { -Backend TransToPtenBackend(const phi::Place& place) { +Backend TransToPhiBackend(const phi::Place& place) { if (place.GetType() == phi::AllocationType::CPU) { return Backend::CPU; } else if (place.GetType() == phi::AllocationType::GPU) { @@ -41,7 +41,7 @@ Backend TransToPtenBackend(const phi::Place& place) { } } -phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { +phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { // NOTE(zhiqiu): GetCurrentDeviceId not always success, and device id is not // always needed. // So, add set_device_id parameter here. 
@@ -87,21 +87,21 @@ phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id) { } } -std::string TransToPtenKernelName(const std::string& fluid_op_name) { +std::string TransToPhiKernelName(const std::string& fluid_op_name) { return OpUtilsMap::Instance().GetBaseKernelName(fluid_op_name); } -const std::string& TransToFluidOpName(const std::string& pten_kernel_name) { +const std::string& TransToFluidOpName(const std::string& phi_kernel_name) { auto& base_kernel_name_map = OpUtilsMap::Instance().base_kernel_name_map(); auto it = std::find_if(base_kernel_name_map.begin(), base_kernel_name_map.end(), - [&pten_kernel_name](const auto& pair) { - return pair.second == pten_kernel_name; + [&phi_kernel_name](const auto& pair) { + return pair.second == phi_kernel_name; }); if (it != base_kernel_name_map.end()) { return it->first; } - return pten_kernel_name; + return phi_kernel_name; } } // namespace phi diff --git a/paddle/phi/core/compat/convert_utils.h b/paddle/phi/core/compat/convert_utils.h index 058f0ecdf7bc2..621459764873e 100644 --- a/paddle/phi/core/compat/convert_utils.h +++ b/paddle/phi/core/compat/convert_utils.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace phi { -std::string TransToPtenKernelName(const std::string& fluid_op_name); -const std::string& TransToFluidOpName(const std::string& pten_kernel_name); +std::string TransToPhiKernelName(const std::string& fluid_op_name); +const std::string& TransToFluidOpName(const std::string& phi_kernel_name); -Backend TransToPtenBackend(const phi::Place& place); -phi::Place TransToPtenPlace(const Backend& backend, bool set_device_id = true); +Backend TransToPhiBackend(const phi::Place& place); +phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id = true); } // namespace phi diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index 75ff9cc286003..f84a2bd8d9c5d 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -22,7 +22,7 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { for (auto& pair : kernel_info_map) { PADDLE_ENFORCE_EQ( - KernelFactory::Instance().HasCompatiblePtenKernel(pair.first), + KernelFactory::Instance().HasCompatiblePhiKernel(pair.first), true, phi::errors::InvalidArgument( "The kernel %s is not ready for custom kernel registering.", diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 6ce8bea35d9dd..29e7dc01f32db 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -161,7 +161,7 @@ void* DenseTensor::mutable_data(const Place& place, /* @jim19930609: The following "mutable_data" only supports specific dtypes defined in OpProto. This part need another clean up once the data type across Fluid - and Pten get unified. + and Phi get unified. 
*/ template inline T* DenseTensor::mutable_data(const DDim& dims, diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index b31bedd958b4b..be91409762635 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -225,8 +225,8 @@ class KernelFactory { KernelNameMap& kernels() { return kernels_; } - bool HasCompatiblePtenKernel(const std::string& op_type) const { - return kernels_.find(TransToPtenKernelName(op_type)) != kernels_.end(); + bool HasCompatiblePhiKernel(const std::string& op_type) const { + return kernels_.find(TransToPhiKernelName(op_type)) != kernels_.end(); } const Kernel& SelectKernelOrThrowError(const std::string& kernel_name, diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index efb01d6664238..a190b222f86ac 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -23,39 +23,39 @@ limitations under the License. */ namespace phi { -#define _PtenForEachDataTypeHelper_(callback, cpp_type, data_type) \ +#define _PhiForEachDataTypeHelper_(callback, cpp_type, data_type) \ callback(cpp_type, data_type); -#define _PtenForEachDataType_(callback) \ - _PtenForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ - _PtenForEachDataTypeHelper_( \ +#define _PhiForEachDataType_(callback) \ + _PhiForEachDataTypeHelper_(callback, float, DataType::FLOAT32); \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::float16, DataType::FLOAT16); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::bfloat16, DataType::BFLOAT16); \ - _PtenForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ - _PtenForEachDataTypeHelper_(callback, int, DataType::INT32); \ - _PtenForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ - _PtenForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ - _PtenForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ - _PtenForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ - _PtenForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_(callback, double, DataType::FLOAT64); \ + _PhiForEachDataTypeHelper_(callback, int, DataType::INT32); \ + _PhiForEachDataTypeHelper_(callback, int64_t, DataType::INT64); \ + _PhiForEachDataTypeHelper_(callback, bool, DataType::BOOL); \ + _PhiForEachDataTypeHelper_(callback, uint8_t, DataType::UINT8); \ + _PhiForEachDataTypeHelper_(callback, int16_t, DataType::INT16); \ + _PhiForEachDataTypeHelper_(callback, int8_t, DataType::INT8); \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX64); \ - _PtenForEachDataTypeHelper_( \ + _PhiForEachDataTypeHelper_( \ callback, ::phi::dtype::complex, DataType::COMPLEX128); template inline void VisitDataType(phi::DataType type, Visitor visitor) { -#define PtenVisitDataTypeCallback(cpp_type, data_type) \ - do { \ - if (type == data_type) { \ - visitor.template apply(); \ - return; \ - } \ +#define PhiVisitDataTypeCallback(cpp_type, data_type) \ + do { \ + if (type == data_type) { \ + visitor.template apply(); \ + return; \ + } \ } while (0) - _PtenForEachDataType_(PtenVisitDataTypeCallback); -#undef PtenVisitDataTypeCallback + _PhiForEachDataType_(PhiVisitDataTypeCallback); +#undef PhiVisitDataTypeCallback PADDLE_THROW(phi::errors::Unimplemented( "Not supported phi::DataType(%d) as data type.", static_cast(type))); } diff --git a/paddle/phi/kernels/diagonal_kernel.h b/paddle/phi/kernels/diagonal_kernel.h index 
f233ba2a95627..7cf7282307a4b 100644 --- a/paddle/phi/kernels/diagonal_kernel.h +++ b/paddle/phi/kernels/diagonal_kernel.h @@ -25,4 +25,4 @@ void DiagonalKernel(const Context& dev_ctx, int axis1, int axis2, DenseTensor* out); -} // pten +} // phi diff --git a/paddle/phi/kernels/digamma_grad_kernel.h b/paddle/phi/kernels/digamma_grad_kernel.h index b95d98895aa8e..38912a5ccc442 100644 --- a/paddle/phi/kernels/digamma_grad_kernel.h +++ b/paddle/phi/kernels/digamma_grad_kernel.h @@ -24,4 +24,4 @@ void DigammaGradKernel(const Context& ctx, const DenseTensor& x, DenseTensor* x_grad); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/digamma_kernel.h b/paddle/phi/kernels/digamma_kernel.h index 1772a33e4ee4c..ce25f2e148e96 100644 --- a/paddle/phi/kernels/digamma_kernel.h +++ b/paddle/phi/kernels/digamma_kernel.h @@ -21,4 +21,4 @@ namespace phi { template void DigammaKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/expand_kernel.h b/paddle/phi/kernels/expand_kernel.h index eb32ed2456859..fb5a0112ffcf7 100644 --- a/paddle/phi/kernels/expand_kernel.h +++ b/paddle/phi/kernels/expand_kernel.h @@ -26,4 +26,4 @@ void ExpandKernel(const Context& ctx, const ScalarArray& shape, DenseTensor* out); -} // namepsace pten +} // namepsace phi diff --git a/paddle/phi/kernels/masked_select_grad_kernel.h b/paddle/phi/kernels/masked_select_grad_kernel.h index fd16091a665ca..f9db1fcd2acc7 100644 --- a/paddle/phi/kernels/masked_select_grad_kernel.h +++ b/paddle/phi/kernels/masked_select_grad_kernel.h @@ -24,4 +24,4 @@ void MaskedSelectGradKernel(const Context& dev_ctx, const DenseTensor& mask, DenseTensor* x_grad); -} // namspace pten +} // namspace phi diff --git a/paddle/phi/kernels/masked_select_kernel.h b/paddle/phi/kernels/masked_select_kernel.h index abd3c318986d8..471f650690d36 100644 --- a/paddle/phi/kernels/masked_select_kernel.h +++ b/paddle/phi/kernels/masked_select_kernel.h @@ -23,4 +23,4 @@ void MaskedSelectKernel(const Context& dev_ctx, const DenseTensor& mask, DenseTensor* out); -} // namspace pten +} // namspace phi diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index c981ca1158507..60df877355b82 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -69,7 +69,7 @@ void TransferLayoutKernel(const Context& dev_ctx, } // namespace phi -PD_REGISTER_GENERAL_KERNEL(pten_transfer_layout, +PD_REGISTER_GENERAL_KERNEL(phi_transfer_layout, CPU, ALL_LAYOUT, phi::TransferLayoutKernel, diff --git a/paddle/phi/ops/compat/scale_sig.cc b/paddle/phi/ops/compat/scale_sig.cc index 915ea4ce302ae..95deb007d99d9 100644 --- a/paddle/phi/ops/compat/scale_sig.cc +++ b/paddle/phi/ops/compat/scale_sig.cc @@ -20,7 +20,7 @@ namespace phi { * Note [ Why does the ArgumentMapping function need to be so complicated? ] * * In order to meet the requirements of infrt, the function used to match Op - * and Kernel parameters, need to be placed in pten as a compatible component, + * and Kernel parameters, need to be placed in phi as a compatible component, * and does not depend on fluid. 
* * Because infrt not only needs to dynamically call this argument mapping diff --git a/paddle/phi/tests/api/scale_api.h b/paddle/phi/tests/api/scale_api.h index 829b93b88b4f9..d93f00129b9a1 100644 --- a/paddle/phi/tests/api/scale_api.h +++ b/paddle/phi/tests/api/scale_api.h @@ -71,7 +71,7 @@ PADDLE_API Tensor scale_kernel_context(const Tensor& x, auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(kernel_backend)), + phi::TransToPhiPlace(kernel_backend)), phi::DenseTensorMeta()); phi::MetaTensor meta_out(dense_out.get()); phi::UnchangedInferMeta(*dense_x, &meta_out); @@ -238,7 +238,7 @@ Tensor scale_switch_case(const Tensor& x, auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(kernel_backend)), + phi::TransToPhiPlace(kernel_backend)), phi::DenseTensorMeta()); phi::MetaTensor meta_out(dense_out.get()); phi::UnchangedInferMeta(*dense_x, &meta_out); diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc index 2e38a1593461e..a3c497bd427ae 100644 --- a/paddle/phi/tests/api/test_data_transform.cc +++ b/paddle/phi/tests/api/test_data_transform.cc @@ -83,7 +83,7 @@ TEST(Tensor, data_transform_diff_place) { ASSERT_EQ(out.layout(), phi::DataLayout::NCHW); ASSERT_EQ(out.initialized(), true); ASSERT_EQ(out.impl()->place(), - phi::TransToPtenPlace(experimental::Backend::GPU)); + phi::TransToPhiPlace(experimental::Backend::GPU)); auto ref_out = experimental::copy_to(out, experimental::Backend::CPU, true); diff --git a/paddle/phi/tests/api/test_pten_tensor.cc b/paddle/phi/tests/api/test_pten_tensor.cc index de88561c4d675..dc2883c1794e2 100644 --- a/paddle/phi/tests/api/test_pten_tensor.cc +++ b/paddle/phi/tests/api/test_pten_tensor.cc @@ -211,7 +211,7 @@ void TestJudgeTensorType() { CHECK(test_tensor.is_dense_tensor() == true); } -TEST(PtenTensor, All) { +TEST(PhiTensor, All) { VLOG(2) << "TestCopy"; GroupTestCopy(); VLOG(2) << "TestDtype"; diff --git a/paddle/phi/tests/common/test_place.cc b/paddle/phi/tests/common/test_place.cc index c311a6733b04d..ed2eb7126ed28 100644 --- a/paddle/phi/tests/common/test_place.cc +++ b/paddle/phi/tests/common/test_place.cc @@ -20,7 +20,7 @@ limitations under the License. 
*/ namespace phi { namespace tests { -TEST(PtenPlace, place) { +TEST(PhiPlace, place) { phi::Place place; EXPECT_EQ(place.GetType(), phi::AllocationType::UNDEFINED); diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index bc75e6ec45245..d8e42c9d0d8b1 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -148,9 +148,9 @@ TEST(CustomKernel, custom_kernel_dot) { // 3.before register auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); - EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePtenKernel(op_name)); + EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name)); - // mock fake_dot is supported by phi for HasCompatiblePtenKernel check while + // mock fake_dot is supported by phi for HasCompatiblePhiKernel check while // registering auto& fake_dot_kernels = kernels[op_name]; @@ -251,7 +251,7 @@ TEST(CustomKernel, custom_kernel_dot) { phi::dtype::float16 fake_attr_f16 = phi::dtype::float16(5); phi::DataType fake_attr_dtype = phi::DataType::UINT32; paddle::framework::LoDTensor tmp_tensor; - tmp_tensor.mutable_data({1}, phi::TransToPtenPlace(backend)); + tmp_tensor.mutable_data({1}, phi::TransToPhiPlace(backend)); phi::Scalar fake_attr_scalar{tmp_tensor}; phi::ScalarArray fake_attr_scalar_array; std::vector fake_attr_int64_vec; @@ -271,7 +271,7 @@ TEST(CustomKernel, custom_kernel_dot) { auto dense_out = std::make_shared( phi::make_intrusive( - phi::TransToPtenPlace(backend)), + phi::TransToPhiPlace(backend)), phi::DenseTensorMeta()); phi::MetaTensor meta_out(dense_out.get()); diff --git a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc index 6e0b44b71f7f8..76158596cb815 100644 --- a/python/paddle/fluid/tests/custom_op/custom_linear_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_linear_op.cc @@ -17,9 +17,9 @@ limitations under the License. 
*/ #include "paddle/extension.h" // The linear implemented here must be passed in bias -std::vector PtenLinearForward(const paddle::Tensor& x, - const paddle::Tensor& weight, - const paddle::Tensor& bias) { +std::vector PhiLinearForward(const paddle::Tensor& x, + const paddle::Tensor& weight, + const paddle::Tensor& bias) { return { paddle::experimental::add(paddle::experimental::matmul(x, weight), bias)}; } @@ -90,6 +90,6 @@ std::vector LinearInferDtype( PD_BUILD_OP(pten_linear) .Inputs({"X", "Weight", "Bias"}) .Outputs({"Out"}) - .SetKernelFn(PD_KERNEL(PtenLinearForward)) + .SetKernelFn(PD_KERNEL(PhiLinearForward)) .SetInferShapeFn(PD_INFER_SHAPE(LinearInferShape)) .SetInferDtypeFn(PD_INFER_DTYPE(LinearInferDtype)); diff --git a/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py b/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py index 815598d901766..a429717bdaf37 100644 --- a/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py +++ b/python/paddle/fluid/tests/unittests/test_get_all_registered_op_kernels.py @@ -19,13 +19,13 @@ class TestGetAllRegisteredOpKernels(unittest.TestCase): - # reshape kernel is in fluid while not in pten - def test_pten_kernels(self): - self.assertTrue(core._get_all_register_op_kernels('pten')['sign']) + # reshape kernel is in fluid while not in phi + def test_phi_kernels(self): + self.assertTrue(core._get_all_register_op_kernels('phi')['sign']) with self.assertRaises(KeyError): - core._get_all_register_op_kernels('pten')['reshape'] + core._get_all_register_op_kernels('phi')['reshape'] - # sign kernel is removed from fluid and added into pten + # sign kernel is removed from fluid and added into phi def test_fluid_kernels(self): self.assertTrue(core._get_all_register_op_kernels('fluid')['reshape']) with self.assertRaises(KeyError): diff --git a/python/setup.py.in b/python/setup.py.in index f39429387dbc3..ec1b1cbcb9510 100755 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -571,13 +571,13 @@ def find_files(pattern, root, recursive=False): headers = ( # paddle level api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api')) + # pten unify api header + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api')) + # phi unify api header list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/ext')) + # custom op api - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/include')) + # pten api - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/common')) + # pten common headers - # pten level api headers (low level api) - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # pten core headers - list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) + # pten backends headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/api/include')) + # phi api + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/common')) + # phi common headers + # phi level api headers (low level api) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/core', recursive=True)) + # phi core headers + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/phi/backends', recursive=True)) + # phi backends headers # utila api headers list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/utils', recursive=True)) + # paddle utils headers ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/device/device_ext.h']) From 3cb93edfc35aeb67ed94b8a46979e27aaef92eed Mon Sep 17 00:00:00 2001 
From: zhangchunle Date: Mon, 28 Feb 2022 13:13:12 +0800 Subject: [PATCH 83/85] PR-CI-Py3 change cpu test (#39659) * update;test=cpu-py3 --- CMakeLists.txt | 1 + paddle/fluid/operators/math/CMakeLists.txt | 1 + paddle/scripts/paddle_build.sh | 157 +++++++++++++++++- .../fluid/tests/unittests/CMakeLists.txt | 7 +- .../distributed_passes/CMakeLists.txt | 1 + 5 files changed, 162 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a4c1b9c8098e9..5b499fb43ab99 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,6 +330,7 @@ if(WITH_BRPC_RDMA) endif() endif() + if(WITH_GPU) include(cuda) # lite subgraph compilation depends on CUDNN_ROOT, diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index d7d1093b9b3bf..ac6566a87030d 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -42,6 +42,7 @@ endif() math_library(fc DEPS blas jit_kernel_helper) math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) math_library(prelu) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 7f2ad893f67a3..2ab7a66d738db 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -713,10 +713,159 @@ EOF fi } +function run_linux_cpu_test() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + pip install hypothesis + pip install ${PADDLE_ROOT}/build/python/dist/*whl + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python + ut_total_startTime_s=`date +%s` + if [ ${WITH_TESTING:-ON} == "ON" ] ; then + cat <> ${PADDLE_ROOT}/build/build_summary.txt + ut_actual_total_endTime_s=`date +%s` + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" + echo "ipipe_log_param_actual_TestCases_Total_Time: $[ $ut_actual_total_endTime_s - $ut_actual_total_startTime_s ]s" >> ${PADDLE_ROOT}/build/build_summary.txt + if [[ "$EXIT_CODE" != "0" ]]; then + show_ut_retry_result + fi +set -ex + fi +} function get_precision_ut_mac() { on_precision=0 UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') - precison_cases="" + precision_cases="" if [ ${PRECISION_TEST:-OFF} == "ON" ]; then python3.7 $PADDLE_ROOT/tools/get_pr_ut.py if [[ -f "ut_list" ]]; then @@ -2691,9 +2840,11 @@ function main() { cmake_gen ${PYTHON_ABI:-""} build_mac ;; - cicheck_py35) + cicheck_py37) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} - parallel_test + run_linux_cpu_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} + + #parallel_test ;; cpu_cicheck_py35) cmake_gen_and_build ${PYTHON_ABI:-""} ${parallel_number} diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index ad0a81e725707..2361bd2706238 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -590,7 +590,10 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -py_test_modules(test_warpctc_op MODULES test_warpctc_op) +if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) + py_test_modules(test_warpctc_op MODULES test_warpctc_op) + set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) +endif() py_test_modules(test_bilinear_interp_op MODULES test_bilinear_interp_op ENVS ${GC_ENVS}) 
py_test_modules(test_nearest_interp_op MODULES test_nearest_interp_op ENVS ${GC_ENVS}) py_test_modules(test_imperative_resnet MODULES test_imperative_resnet ENVS @@ -935,7 +938,7 @@ set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 200) set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES TIMEOUT 120) -set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) + set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt index 2bea60c3ded1a..729c9c46b4f0c 100755 --- a/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/distributed_passes/CMakeLists.txt @@ -10,6 +10,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_XPU) AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) list(REMOVE_ITEM TEST_OPS "test_dist_fuse_relu_depthwise_conv_pass") list(REMOVE_ITEM TEST_OPS "test_dist_fuse_sgd_pass") list(REMOVE_ITEM TEST_OPS "test_dist_gradient_merge_pass") + list(REMOVE_ITEM TEST_OPS "test_dist_inplace_addto_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_amp_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_recompute_pass") list(REMOVE_ITEM TEST_OPS "test_auto_parallel_sharding_pass") From 35471b1f8af735aad079ca2787c881499d7829d6 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Mon, 28 Feb 2022 13:20:38 +0800 Subject: [PATCH 84/85] =?UTF-8?q?=E3=80=90infrt=E3=80=91add=20TrtOpConvert?= =?UTF-8?q?erPass=20(#39902)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add some trt layers * trtOpConverter pass ok * add comments * add constraints to some attrs in the pd_lower_to_trt patterns * update constraint * fix code style * update pass name * update code style * change .hpp.inc to .cc.inc in mlir_add_rewriter --- cmake/external/llvm.cmake | 2 +- paddle/infrt/CMakeLists.txt | 1 + paddle/infrt/dialect/infrt_base.h | 14 +++++ paddle/infrt/dialect/infrt_base.td | 6 ++ paddle/infrt/dialect/pd_ops.cc | 4 +- paddle/infrt/dialect/tensorrt/CMakeLists.txt | 2 + .../infrt/dialect/tensorrt/pd_lower_to_trt.td | 28 +++++++++ paddle/infrt/dialect/tensorrt/trt_exec.cc | 8 ++- .../dialect/tensorrt/trt_graph_fuse_pass.cc | 2 +- .../dialect/tensorrt/trt_graph_fuse_pass.h | 4 +- .../dialect/tensorrt/trt_graph_split_pass.cc | 2 +- .../dialect/tensorrt/trt_graph_split_pass.h | 6 +- .../dialect/tensorrt/trt_op_converter_pass.cc | 51 ++++++++++++++++ .../dialect/tensorrt/trt_op_converter_pass.h | 59 +++++++++++++++++++ .../dialect/tensorrt/trt_op_teller_pass.cc | 2 +- .../dialect/tensorrt/trt_op_teller_pass.h | 4 +- paddle/infrt/dialect/tensorrt/trt_ops.td | 42 ++++++++++++- .../infrt/tests/dialect/disabled_trt_ops.mlir | 6 +- 18 files changed, 223 insertions(+), 20 deletions(-) create mode 100644 paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td create mode 100644 paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc create mode 100644 paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h diff --git a/cmake/external/llvm.cmake b/cmake/external/llvm.cmake index 27210e5260048..a7a9e85ffd731 100644 --- a/cmake/external/llvm.cmake +++ b/cmake/external/llvm.cmake @@ 
-99,7 +99,7 @@ endfunction() function(mlir_add_rewriter td_base) set(LLVM_TARGET_DEFINITIONS ${td_base}.td) - mlir_tablegen(${td_base}.hpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") + mlir_tablegen(${td_base}.cpp.inc -gen-rewriters "-I${CMAKE_SOURCE_DIR}/infrt/dialect/pass") add_public_tablegen_target(${td_base}_IncGen) add_custom_target(${td_base}_inc DEPENDS ${td_base}_IncGen) endfunction() diff --git a/paddle/infrt/CMakeLists.txt b/paddle/infrt/CMakeLists.txt index dc22eecc99cdd..f2768f3dfa88d 100644 --- a/paddle/infrt/CMakeLists.txt +++ b/paddle/infrt/CMakeLists.txt @@ -97,6 +97,7 @@ set(infrt_mlir_incs pd_extra_ops_inc rewrite_inc trt_ops_inc + pd_lower_to_trt_inc ) if (INFRT_WITH_PHI) diff --git a/paddle/infrt/dialect/infrt_base.h b/paddle/infrt/dialect/infrt_base.h index a8e7e13a681ca..3ef73171dcdea 100644 --- a/paddle/infrt/dialect/infrt_base.h +++ b/paddle/infrt/dialect/infrt_base.h @@ -54,6 +54,20 @@ static mlir::IntegerAttr createI32Attr(mlir::OpBuilder &b, // NOLINT return b.getIntegerAttr(b.getI32Type(), constant); } +template +static mlir::IntegerAttr createSI32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getSI32IntegerAttr(constant); +} + +template +static mlir::FloatAttr createF32Attr(mlir::OpBuilder &b, // NOLINT + mlir::Location loc, + T constant) { + return b.getF32FloatAttr(constant); +} + static mlir::SmallVector cvtValueToValueRange( const mlir::Value &operand) { return mlir::SmallVector(1, operand); diff --git a/paddle/infrt/dialect/infrt_base.td b/paddle/infrt/dialect/infrt_base.td index 4d4727ee8e185..0f50eb2d8fb4a 100644 --- a/paddle/infrt/dialect/infrt_base.td +++ b/paddle/infrt/dialect/infrt_base.td @@ -28,6 +28,12 @@ def BufferType : OpaqueType<"b", "buffer", "buffer">; class INFRT_createI32Attr : NativeCodeCall< "infrt::createI32Attr($_builder, $_loc, " # value # ")">; +class INFRT_createSI32Attr : NativeCodeCall< + "infrt::createSI32Attr($_builder, $_loc, " # value # ")">; + +class INFRT_createF32Attr : NativeCodeCall< + "infrt::createF32Attr($_builder, $_loc, " # value # ")">; + def INFRT_cvtValueToValueRange : NativeCodeCall< "infrt::cvtValueToValueRange($0)">; diff --git a/paddle/infrt/dialect/pd_ops.cc b/paddle/infrt/dialect/pd_ops.cc index 7cf5b2fb20f52..338b04e001320 100644 --- a/paddle/infrt/dialect/pd_ops.cc +++ b/paddle/infrt/dialect/pd_ops.cc @@ -24,11 +24,11 @@ #define GET_OP_CLASSES #include "paddle/infrt/dialect/pd_extra_ops.cpp.inc" // NOLINT -#include "paddle/infrt/dialect/rewrite.hpp.inc" // NOLINT - namespace mlir { namespace pd { +#include "paddle/infrt/dialect/rewrite.cpp.inc" // NOLINT + PaddleDialect::PaddleDialect(MLIRContext *context) : Dialect("pd", context, TypeID::get()) { addOperations< diff --git a/paddle/infrt/dialect/tensorrt/CMakeLists.txt b/paddle/infrt/dialect/tensorrt/CMakeLists.txt index 794266513eb81..99c335ed1782e 100755 --- a/paddle/infrt/dialect/tensorrt/CMakeLists.txt +++ b/paddle/infrt/dialect/tensorrt/CMakeLists.txt @@ -2,11 +2,13 @@ core_gather_headers() gather_srcs(infrt_src SRCS trt_ops.cc + trt_op_converter_pass.cc trt_op_teller_pass.cc trt_graph_fuse_pass.cc trt_graph_split_pass.cc ) mlir_tablegen_on(trt_ops) +mlir_add_rewriter(pd_lower_to_trt) add_executable(trt-exec trt_exec.cc) target_link_libraries(trt-exec infrt ${MLIR_IR_LIBS}) diff --git a/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td new file mode 100644 index 0000000000000..701391a750354 --- /dev/null +++ 
b/paddle/infrt/dialect/tensorrt/pd_lower_to_trt.td
@@ -0,0 +1,28 @@
+#ifndef PD_LOWER_TO_TRT
+#define PD_LOWER_TO_TRT
+
+include "mlir/Interfaces/SideEffectInterfaces.td"
+include "paddle/infrt/dialect/infrt_base.td"
+include "paddle/infrt/dialect/pd_ops.td"
+include "paddle/infrt/dialect/tensorrt/trt_ops.td"
+
+def PD2TRT_Matmul_Lower : Pat<
+        (PD_MatmulOp $X, $Y, $transpose_X, $transpose_Y, ConstantAttr, ConstantAttr),
+        (TRT_MatrixMultiplyOp $X, $transpose_X, $Y, $transpose_Y)>;
+
+// TODO(shangzhizhou): replace (INFRT_createSI32Attr<"0">) with the enum nvinfer1::ElementWiseOperation::kSUM
+def PD2TRT_ElementwiseAdd_Lower : Pat<
+        (PD_Elementwise_addOp $X, $Y, ConstantAttr),
+        (TRT_ElementWiseOp $X, $Y, (INFRT_createSI32Attr<"0">)/*kSUM*/)>;
+
+// TODO(shangzhizhou): replace (INFRT_createSI32Attr<"0">) with the enum nvinfer1::ActivationType::kRELU
+def PD2TRT_Relu_Lower : Pat<
+        (PD_ReluOp $X),
+        (TRT_ActivationOp $X, (INFRT_createSI32Attr<"0">)/*kRELU*/, (INFRT_createF32Attr<"0.0">), (INFRT_createF32Attr<"0.0">))>;
+
+// TODO(shangzhizhou): replace (INFRT_createSI32Attr<"8">) with the enum nvinfer1::ActivationType::kCLIP
+def PD2TRT_Relu6_Lower : Pat<
+        (PD_Relu6Op $X, $threshold),
+        (TRT_ActivationOp $X, (INFRT_createSI32Attr<"8">)/*kCLIP*/, (INFRT_createF32Attr<"0.0">), $threshold)>;
+
+#endif  // PD_LOWER_TO_TRT
diff --git a/paddle/infrt/dialect/tensorrt/trt_exec.cc b/paddle/infrt/dialect/tensorrt/trt_exec.cc
index 1baef7a3f77fd..7af1fa53d12e3 100644
--- a/paddle/infrt/dialect/tensorrt/trt_exec.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_exec.cc
@@ -19,6 +19,7 @@
 #include "paddle/infrt/dialect/mlir_loader.h"
 #include "paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h"
 #include "paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h"
+#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
 #include "paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h"
 
 int main(int argc, char** argv) {
@@ -36,9 +37,10 @@ int main(int argc, char** argv) {
   mlir::PassManager pm(context);
   mlir::OpPassManager& trt_pass_manager = pm.nest<mlir::FuncOp>();
-  trt_pass_manager.addPass(std::make_unique<trtOpTellerPass>());
-  trt_pass_manager.addPass(std::make_unique<trtGraphFusePass>());
-  trt_pass_manager.addPass(std::make_unique<trtGraphSplitPass>(10));
+  trt_pass_manager.addPass(std::make_unique<TRTOpTellerPass>());
+  trt_pass_manager.addPass(std::make_unique<TRTGraphFusePass>());
+  trt_pass_manager.addPass(std::make_unique<TRTGraphSplitPass>(1));
+  trt_pass_manager.addPass(std::make_unique<TRTOpConverterPass>());
   if (mlir::failed(pm.run(*module))) {
     std::cout << "\npass failed!\n" << std::endl;
     return 4;
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
index 1da80ef2c3b10..17633a4e8e992 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.cc
@@ -142,7 +142,7 @@ void topoSortBlock(mlir::Block &body) {  // NOLINT
 }
 }  // namespace
 
 // Implementation of the trtGraphFusePass.
-void trtGraphFusePass::runOnFunction() {
+void TRTGraphFusePass::runOnFunction() {
   mlir::Block &body = getFunction().front();
   mlir::OpBuilder builder(&body, body.begin());
   bool changed = false;
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h
index f1e555c6f67ec..ebd7a4ac4bd37 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_fuse_pass.h
@@ -52,8 +52,8 @@ namespace trt {
  * "pd.fetch" %d, %f
  * }
  */
-class trtGraphFusePass
-    : public mlir::PassWrapper<trtGraphFusePass, mlir::FunctionPass> {
+class TRTGraphFusePass
+    : public mlir::PassWrapper<TRTGraphFusePass, mlir::FunctionPass> {
  public:
   ::llvm::StringRef getName() const override { return "trtGraphFusePass"; }
   void runOnFunction() override;
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
index 257f2b5285425..f24b9cc40cdcc 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.cc
@@ -21,7 +21,7 @@ namespace infrt {
 namespace trt {
 // Implementation of the trtGraphSplitPass。
-void trtGraphSplitPass::runOnFunction() {
+void TRTGraphSplitPass::runOnFunction() {
   std::vector<mlir::Operation *> worklist;
   mlir::Block& block = getFunction().front();
   for (auto& op : block) {
diff --git a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h
index d30d186647fc3..51f8422724340 100644
--- a/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h
+++ b/paddle/infrt/dialect/tensorrt/trt_graph_split_pass.h
@@ -45,12 +45,12 @@ namespace trt {
  * "pd.fetch" (%d, %f)
  * }
  */
-class trtGraphSplitPass
-    : public mlir::PassWrapper<trtGraphSplitPass, mlir::FunctionPass> {
+class TRTGraphSplitPass
+    : public mlir::PassWrapper<TRTGraphSplitPass, mlir::FunctionPass> {
  public:
   ::llvm::StringRef getName() const override { return "trtGraphSplitPass"; }
   void runOnFunction() override;
-  explicit trtGraphSplitPass(size_t min_subgraph_size = 3)
+  explicit TRTGraphSplitPass(size_t min_subgraph_size = 3)
       : min_subgraph_size_(min_subgraph_size) {}
 
  private:
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
new file mode 100644
index 0000000000000..e34308a2f0fa8
--- /dev/null
+++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.cc
@@ -0,0 +1,51 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Transforms/DialectConversion.h"
+#include "paddle/infrt/dialect/infrt_base.h"
+#include "paddle/infrt/dialect/pd_ops.h"
+
+namespace infrt {
+namespace trt {
+
+#include "paddle/infrt/dialect/tensorrt/pd_lower_to_trt.cpp.inc"  // NOLINT
+
+using namespace mlir;
+
+void TRTOpConverterPass::runOnOperation() {
+  // The first thing to define is the conversion target. This will define the
+  // final target for this lowering.
+  ConversionTarget target(getContext());
+
+  // We define the specific operations, or dialects, that are legal targets for
+  // this lowering. In our case, we are lowering to TensorRTDialect from
+  // PaddleDialect.
+  target.addLegalDialect<TensorRTDialect>();
+
+  // Now that the conversion target has been defined, we just need to provide
+  // the set of patterns that will lower the Paddle operations.
+  RewritePatternSet patterns(&getContext());
+  populateWithGenerated(patterns);
+
+  // With the target and rewrite patterns defined, we can now attempt the
+  // conversion. The conversion will signal failure if any of our `illegal`
+  // operations were not converted successfully.
+  if (failed(
+          applyPartialConversion(getOperation(), target, std::move(patterns))))
+    signalPassFailure();
+}
+
+}  // namespace trt
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h
new file mode 100644
index 0000000000000..0adbf11b89144
--- /dev/null
+++ b/paddle/infrt/dialect/tensorrt/trt_op_converter_pass.h
@@ -0,0 +1,59 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "mlir/IR/Dialect.h"
+#include "mlir/Pass/Pass.h"
+#include "paddle/infrt/dialect/tensorrt/trt_ops.h"
+
+namespace infrt {
+namespace trt {
+/*
+ * trtOpConverterPass.
+ *
+ * source ir:
+ *    func @main() -> tensor {
+ *     %a = "pd.feed"()...
+ *     %d, %f = "pd.graph"(%a) {
+ *       %m = "pd.conv2d"(%a)...
+ *       %n = "pd.conv3d"(%m)...
+ *       %s = "pd.conv2d"(%a)...
+ *       "pd.return" %n, %s
+ *     } ...
+ *     "pd.fetch" %d, %f
+ *    }
+ *
+ * destination ir:
+ *    func @main() -> tensor {
+ *     %a = "pd.feed"()...
+ *     %d, %f = "pd.graph"(%a) {
+ *       %m = "trt.Convolution"(%a)...
+ *       %n = "trt.Convolution"(%m)...
+ *       %s = "trt.Convolution"(%a)...
+ *       "pd.return" %n, %s
+ *     } ...
+ *     "pd.fetch" %d, %f
+ *    }
+ */
+struct TRTOpConverterPass
+    : public mlir::PassWrapper<TRTOpConverterPass,
+                               mlir::OperationPass<mlir::FuncOp>> {
+  void getDependentDialects(mlir::DialectRegistry &registry) const override {
+    registry.insert<TensorRTDialect>();
+  }
+  ::llvm::StringRef getName() const override { return "trtOpConverterPass"; }
+  void runOnOperation() final;
+};
+}  // namespace trt
+}  // namespace infrt
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
index 4e8d40b982b2e..176fdb7a2e054 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
+++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.cc
@@ -20,7 +20,7 @@ namespace infrt {
 namespace trt {
 // Implementation of the trtOpTellerPass。
-void trtOpTellerPass::runOnFunction() {
+void TRTOpTellerPass::runOnFunction() {
   mlir::Block &body = getFunction().front();
   std::vector<mlir::Operation *> worklist;
   worklist.reserve(body.getOperations().size());
diff --git a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h
index fb16c974f7fb3..8b9a16376ce55 100644
--- a/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h
+++ b/paddle/infrt/dialect/tensorrt/trt_op_teller_pass.h
@@ -52,8 +52,8 @@ namespace trt {
  * TODO(winter-wang): Supplementary how to judge the operators can be supported
  * by tensorrt.
  */
-class trtOpTellerPass
-    : public mlir::PassWrapper<trtOpTellerPass, mlir::FunctionPass> {
+class TRTOpTellerPass
+    : public mlir::PassWrapper<TRTOpTellerPass, mlir::FunctionPass> {
 public:
   ::llvm::StringRef getName() const override { return "trtOpTellerPass"; }
   void runOnFunction() override;
diff --git a/paddle/infrt/dialect/tensorrt/trt_ops.td b/paddle/infrt/dialect/tensorrt/trt_ops.td
index cc072b6e6885b..8e3dfffff54f1 100755
--- a/paddle/infrt/dialect/tensorrt/trt_ops.td
+++ b/paddle/infrt/dialect/tensorrt/trt_ops.td
@@ -23,8 +23,48 @@ def TRT_GraphOp : TRT_Op<"graph", [SingleBlockImplicitTerminator<"FetchOp">]> {
     Describe a tensorrt subgraph.
   }];
   let regions = (region SizedRegion<1>:$body);
-  
+
   let arguments = (ins Variadic:$inputs);
   let results = (outs Variadic:$outputs);
 }
+
+def TRT_ActivationOp : TRT_Op<"Activation", [NoSideEffect]> {
+  let summary = "TensorRT IActivationLayer";
+  let description = [{
+
+    TensorRT IActivationLayer.
+
+  }];
+  let arguments = (ins TRT_Tensor:$input, SI32Attr:$activation_type,
+                   DefaultValuedAttr<F32Attr, "0.0">:$alpha,
+                   DefaultValuedAttr<F32Attr, "0.0">:$beta);
+
+  let results = (outs TRT_Tensor:$output);
+}
+
+def TRT_ElementWiseOp : TRT_Op<"ElementWise", [NoSideEffect]> {
+  let summary = "TensorRT IElementWiseLayer";
+  let description = [{
+
+    TensorRT IElementWiseLayer.
+
+  }];
+  let arguments = (ins TRT_Tensor:$input1, TRT_Tensor:$input2, SI32Attr:$elementwise_operation);
+
+  let results = (outs TRT_Tensor:$output);
+}
+
+def TRT_MatrixMultiplyOp : TRT_Op<"MatrixMultiply", [NoSideEffect]> {
+  let summary = "TensorRT IMatrixMultiplyLayer";
+  let description = [{
+
+    TensorRT IMatrixMultiplyLayer.
+
+  }];
+  let arguments = (ins TRT_Tensor:$input1, BoolAttr:$transpose1,
+                   TRT_Tensor:$input2, BoolAttr:$transpose2);
+
+  let results = (outs TRT_Tensor:$output);
+}
+
 #endif  // TRT_OPS
diff --git a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir
index 75ec98f04661a..b59cfb0481697 100644
--- a/paddle/infrt/tests/dialect/disabled_trt_ops.mlir
+++ b/paddle/infrt/tests/dialect/disabled_trt_ops.mlir
@@ -7,15 +7,15 @@ func @main() -> tensor {
   %bias1 = "pd.feed"() {name="input4"} : () -> tensor
   %bias2 = "pd.feed"() {name="input5"} : () -> tensor
 
-  %d = "pd.elementwise_add"(%c, %bias) {axis=1:si32} : (tensor, tensor) -> tensor
+  %d = "pd.elementwise_add"(%c, %bias) {axis=-1:si32} : (tensor, tensor) -> tensor
   %e = "pd.relu6"(%d) {} : (tensor) -> tensor
 
   %c1 = "pd.matmul"(%e, %b1) {transpose_x=false, transpose_y=false} : (tensor, tensor) -> tensor
-  %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=1:si32} : (tensor, tensor) -> tensor
+  %d1 = "pd.elementwise_add"(%c1, %bias1) {axis=-1:si32} : (tensor, tensor) -> tensor
   %e1 = "pd.relu"(%d1) {} : (tensor) -> tensor
 
   %c2 = "pd.matmul"(%e1, %b2) {transpose_x=true, transpose_y=false} : (tensor, tensor) -> tensor
-  %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=1:si32} : (tensor, tensor) -> tensor
+  %d2 = "pd.elementwise_add"(%c2, %bias2) {axis=-1:si32} : (tensor, tensor) -> tensor
   %e2 = "pd.relu"(%d2) {} : (tensor) -> tensor
 
   "pd.fetch"(%e2) {name="output"} :(tensor)->()

From 23aa7a368364659ab02ca3c21af7637d909f736f Mon Sep 17 00:00:00 2001
From: furnace <34057289+windstamp@users.noreply.github.com>
Date: Mon, 28 Feb 2022 13:45:18 +0800
Subject: [PATCH 85/85] [Phi] move truncated_gaussian_random kernel (#39971)

* [Phi] move truncated_gaussian_random, copy kernels

* [Phi] move truncated_gaussian_random, kernel register

* [Phi] move truncated_gaussian_random, delete useless codes
---
 .../operators/truncated_gaussian_random_op.cc |  24 ---
 .../operators/truncated_gaussian_random_op.cu | 128 -------------
 .../cpu/truncated_gaussian_random_kernel.cc   |  57 ++++++
 .../gpu/truncated_gaussian_random_kernel.cu   | 139 +++++++++++++++
 .../truncated_gaussian_random_kernel.h        | 168 ++++++++++++++++++
 .../compat/truncated_gaussian_random_sig.cc   |  30 ++++
 6 files changed, 394 insertions(+), 152 deletions(-)
 delete mode 100644 paddle/fluid/operators/truncated_gaussian_random_op.cu
 create mode 100644 paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
 create mode 100644 paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
 create mode 100644 paddle/phi/kernels/truncated_gaussian_random_kernel.h
 create mode 100644 paddle/phi/ops/compat/truncated_gaussian_random_sig.cc

diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc
index f980e007271e3..6eb7f922dfdbe 100644
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cc
+++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc
@@ -23,28 +23,6 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
 
-template <typename T>
-class CPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    float mean = context.Attr<float>("mean");
-    float std = context.Attr<float>("std");
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    std::uniform_real_distribution<T> dist(std::numeric_limits<T>::min(),
-                                           1.0);
-    TruncatedNormal<T> truncated_normal(mean, std);
-    int64_t size = tensor->numel();
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    auto engine = framework::GetCPURandomEngine(seed);
-    for (int64_t i = 0; i < size; ++i) {
-      data[i] = truncated_normal(dist(*engine));
-    }
-  }
-};
-
 class TruncatedGaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -124,5 +102,3 @@ namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(truncated_gaussian_random,
                              ops::TruncatedGaussianRandomOp,
                              ops::TruncatedGaussianRandomOpMaker);
-REGISTER_OP_CPU_KERNEL(truncated_gaussian_random,
-                       ops::CPUTruncatedGaussianRandomKernel<float>);
diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu
deleted file mode 100644
index 5e530a5bb5248..0000000000000
--- a/paddle/fluid/operators/truncated_gaussian_random_op.cu
+++ /dev/null
@@ -1,128 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thrust/device_ptr.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/random.h>
-#include <thrust/transform.h>
-#include <limits>
-#include "paddle/fluid/framework/generator.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/truncated_gaussian_random_op.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename T>
-struct GPUTruncatedNormal {
-  T mean, std;
-  T a_normal_cdf;
-  T b_normal_cdf;
-  unsigned int seed;
-  T numeric_min;
-
-  __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed)
-      : mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
-    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
-    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
-  }
-
-  __host__ __device__ T operator()(const unsigned int n) const {
-    thrust::minstd_rand rng;
-    rng.seed(seed);
-    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
-    rng.discard(n);
-    T value = dist(rng);
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
-  }
-};
-
-template <typename T>
-struct TruncatedNormalOffset {
-  T mean, std;
-  T a_normal_cdf;
-  T b_normal_cdf;
-  unsigned int seed;
-  T numeric_min;
-  int offset_;
-
-  __host__ __device__ TruncatedNormalOffset(T mean, T std, T numeric_min,
-                                            int seed, int offset)
-      : mean(mean),
-        std(std),
-        seed(seed),
-        numeric_min(numeric_min),
-        offset_(offset) {
-    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
-    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
-  }
-
-  __host__ __device__ T operator()(const unsigned int n) const {
-    thrust::minstd_rand rng;
-    rng.seed(seed);
-    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
-    rng.discard(n + offset_);
-    T value = dist(rng);
-    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
-    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
-  }
-};
-
-template <typename T>
-class GPUTruncatedGaussianRandomKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* tensor = context.Output<framework::Tensor>("Out");
-    T* data = tensor->mutable_data<T>(context.GetPlace());
-
-    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
-    bool seed_flag = false;
-    if (seed == 0) {
-      std::random_device rd;
-      seed = rd();
-      seed_flag = true;
-    }
-    T mean = static_cast<T>(context.Attr<float>("mean"));
-    T std = static_cast<T>(context.Attr<float>("std"));
-    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
-    int64_t size = tensor->numel();
-
-    int device_id = context.GetPlace().GetDeviceId();
-    auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id);
-
-    if (gen_cuda->GetIsInitPy() && seed_flag) {
-      auto seed_offset = gen_cuda->IncrementOffset(1);
-      int64_t gen_offset = size * seed_offset.second;
-      thrust::transform(
-          index_sequence_begin, index_sequence_begin + size,
-          thrust::device_ptr<T>(data),
-          TruncatedNormalOffset<T>(mean, std, std::numeric_limits<T>::min(),
-                                   seed_offset.first, gen_offset));
-    } else {
-      thrust::transform(index_sequence_begin, index_sequence_begin + size,
-                        thrust::device_ptr<T>(data),
-                        GPUTruncatedNormal<T>(
-                            mean, std, std::numeric_limits<T>::min(), seed));
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OP_CUDA_KERNEL(
-    truncated_gaussian_random,
-    paddle::operators::GPUTruncatedGaussianRandomKernel<float>);
diff --git a/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
new file mode 100644
index 0000000000000..ebc032ef54538
--- /dev/null
+++ b/paddle/phi/kernels/cpu/truncated_gaussian_random_kernel.cc
@@ -0,0 +1,57 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h"
+
+#include <limits>
+#include <memory>
+#include <random>
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/generator.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void TruncatedGaussianRandomKernel(const Context& dev_ctx,
+                                   const ScalarArray& shape,
+                                   float mean,
+                                   float std,
+                                   int seed,
+                                   DataType dtype,
+                                   DenseTensor* out) {
+  auto tensor = out;
+
+  T* data = dev_ctx.template Alloc<T>(tensor);
+
+  std::uniform_real_distribution<T> dist(std::numeric_limits<T>::min(),
+                                         1.0);
+  TruncatedNormal<T> truncated_normal(mean, std);
+  int64_t size = tensor->numel();
+
+  auto engine = paddle::framework::GetCPURandomEngine(seed);
+  for (int64_t i = 0; i < size; ++i) {
+    data[i] = truncated_normal(dist(*engine));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(truncated_gaussian_random,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::TruncatedGaussianRandomKernel,
+                   float) {}
diff --git a/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
new file mode 100644
index 0000000000000..12c1bf791e169
--- /dev/null
+++ b/paddle/phi/kernels/gpu/truncated_gaussian_random_kernel.cu
@@ -0,0 +1,139 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
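+// Note: like the fluid kernel it replaces, this GPU path draws one uniform
+// sample per element with thrust and maps it through the inverse
+// truncated-normal CDF; when the default CUDA generator is initialized and no
+// fixed seed is given, the (seed, offset) pair from IncrementOffset keeps
+// repeated launches on distinct subsequences of the random stream.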
+
+#include "paddle/phi/kernels/truncated_gaussian_random_kernel.h"
+
+#include <limits>
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+#include "paddle/fluid/framework/generator.h"
+// #include "paddle/phi/core/generator.h"
+
+namespace phi {
+
+template <typename T>
+struct GPUTruncatedNormal {
+  T mean, std;
+  T a_normal_cdf;
+  T b_normal_cdf;
+  unsigned int seed;
+  T numeric_min;
+
+  __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed)
+      : mean(mean), std(std), seed(seed), numeric_min(numeric_min) {
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+  }
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    rng.discard(n);
+    T value = dist(rng);
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+  }
+};
+
+template <typename T>
+struct TruncatedNormalOffset {
+  T mean, std;
+  T a_normal_cdf;
+  T b_normal_cdf;
+  unsigned int seed;
+  T numeric_min;
+  int offset_;
+
+  __host__ __device__
+  TruncatedNormalOffset(T mean, T std, T numeric_min, int seed, int offset)
+      : mean(mean),
+        std(std),
+        seed(seed),
+        numeric_min(numeric_min),
+        offset_(offset) {
+    a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0;
+    b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0;
+  }
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed);
+    thrust::uniform_real_distribution<T> dist(numeric_min, 1);
+    rng.discard(n + offset_);
+    T value = dist(rng);
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean;
+  }
+};
+
+template <typename T, typename Context>
+void TruncatedGaussianRandomKernel(const Context& dev_ctx,
+                                   const ScalarArray& shape,
+                                   float mean,
+                                   float std,
+                                   int seed,
+                                   DataType dtype,
+                                   DenseTensor* out) {
+  auto tensor = out;
+
+  T* data = dev_ctx.template Alloc<T>(tensor);
+
+  bool seed_flag = false;
+  if (seed == 0) {
+    std::random_device rd;
+    seed = rd();
+    seed_flag = true;
+  }
+
+  thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+  int64_t size = tensor->numel();
+
+  int device_id = dev_ctx.GetPlace().GetDeviceId();
+  auto gen_cuda = paddle::framework::GetDefaultCUDAGenerator(device_id);
+
+  if (gen_cuda->GetIsInitPy() && seed_flag) {
+    auto seed_offset = gen_cuda->IncrementOffset(1);
+    int64_t gen_offset = size * seed_offset.second;
+    thrust::transform(index_sequence_begin,
+                      index_sequence_begin + size,
+                      thrust::device_ptr<T>(data),
+                      TruncatedNormalOffset<T>(mean,
+                                               std,
+                                               std::numeric_limits<T>::min(),
+                                               seed_offset.first,
+                                               gen_offset));
+  } else {
+    thrust::transform(
+        index_sequence_begin,
+        index_sequence_begin + size,
+        thrust::device_ptr<T>(data),
+        GPUTruncatedNormal<T>(mean, std, std::numeric_limits<T>::min(), seed));
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(truncated_gaussian_random,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::TruncatedGaussianRandomKernel,
+                   float) {}
diff --git a/paddle/phi/kernels/truncated_gaussian_random_kernel.h b/paddle/phi/kernels/truncated_gaussian_random_kernel.h
new file mode 100644
index 0000000000000..0370cc431fef9
--- /dev/null
+++ b/paddle/phi/kernels/truncated_gaussian_random_kernel.h
@@ -0,0 +1,168 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <cmath>
+#include <limits>
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/core/device_context.h"
+
+namespace phi {
+
+// reference: https://gist.github.com/lakshayg/d80172fe5ae3c5d2c2aedb53c250320e
+template <typename T>
+T Erfinv(T x) {
+  if (x < -1 || x > 1) {
+    return std::numeric_limits<T>::quiet_NaN();
+  } else if (x == 1.0) {
+    return std::numeric_limits<T>::infinity();
+  } else if (x == -1.0) {
+    return -std::numeric_limits<T>::infinity();
+  }
+
+  const T LN2 = 6.931471805599453094172321214581e-1;
+
+  const T A0 = 1.1975323115670912564578e0;
+  const T A1 = 4.7072688112383978012285e1;
+  const T A2 = 6.9706266534389598238465e2;
+  const T A3 = 4.8548868893843886794648e3;
+  const T A4 = 1.6235862515167575384252e4;
+  const T A5 = 2.3782041382114385731252e4;
+  const T A6 = 1.1819493347062294404278e4;
+  const T A7 = 8.8709406962545514830200e2;
+
+  const T B0 = 1.0000000000000000000e0;
+  const T B1 = 4.2313330701600911252e1;
+  const T B2 = 6.8718700749205790830e2;
+  const T B3 = 5.3941960214247511077e3;
+  const T B4 = 2.1213794301586595867e4;
+  const T B5 = 3.9307895800092710610e4;
+  const T B6 = 2.8729085735721942674e4;
+  const T B7 = 5.2264952788528545610e3;
+
+  const T C0 = 1.42343711074968357734e0;
+  const T C1 = 4.63033784615654529590e0;
+  const T C2 = 5.76949722146069140550e0;
+  const T C3 = 3.64784832476320460504e0;
+  const T C4 = 1.27045825245236838258e0;
+  const T C5 = 2.41780725177450611770e-1;
+  const T C6 = 2.27238449892691845833e-2;
+  const T C7 = 7.74545014278341407640e-4;
+
+  const T D0 = 1.4142135623730950488016887e0;
+  const T D1 = 2.9036514445419946173133295e0;
+  const T D2 = 2.3707661626024532365971225e0;
+  const T D3 = 9.7547832001787427186894837e-1;
+  const T D4 = 2.0945065210512749128288442e-1;
+  const T D5 = 2.1494160384252876777097297e-2;
+  const T D6 = 7.7441459065157709165577218e-4;
+  const T D7 = 1.4859850019840355905497876e-9;
+
+  const T E0 = 6.65790464350110377720e0;
+  const T E1 = 5.46378491116411436990e0;
+  const T E2 = 1.78482653991729133580e0;
+  const T E3 = 2.96560571828504891230e-1;
+  const T E4 = 2.65321895265761230930e-2;
+  const T E5 = 1.24266094738807843860e-3;
+  const T E6 = 2.71155556874348757815e-5;
+  const T E7 = 2.01033439929228813265e-7;
+
+  const T F0 = 1.414213562373095048801689e0;
+  const T F1 = 8.482908416595164588112026e-1;
+  const T F2 = 1.936480946950659106176712e-1;
+  const T F3 = 2.103693768272068968719679e-2;
+  const T F4 = 1.112800997078859844711555e-3;
+  const T F5 = 2.611088405080593625138020e-5;
+  const T F6 = 2.010321207683943062279931e-7;
+  const T F7 = 2.891024605872965461538222e-15;
+
+  T abs_x = abs(x);
+
+  if (abs_x <= 0.85) {
+    T r = 0.180625 - 0.25 * x * x;
+    T num =
+        (((((((A7 * r + A6) * r + A5) * r + A4) * r + A3) * r + A2) * r + A1) *
+             r +
+         A0);
+    T den =
+        (((((((B7 * r + B6) * r + B5) * r + B4) * r + B3) * r + B2) * r + B1) *
+             r +
+         B0);
+    return x * num / den;
+  }
+
+  T r = sqrt(LN2 - log(1.0 - abs_x));
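+  // Tail region (|x| > 0.85): work in the transformed variable
+  // r = sqrt(ln 2 - ln(1 - |x|)); the two rational approximations below
+  // cover the near tail (r <= 5) and the far tail (r > 5).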
+
+  T num, den;
+  if (r <= 5.0) {
+    r = r - 1.6;
+    num =
+        (((((((C7 * r + C6) * r + C5) * r + C4) * r + C3) * r + C2) * r + C1) *
+             r +
+         C0);
+    den =
+        (((((((D7 * r + D6) * r + D5) * r + D4) * r + D3) * r + D2) * r + D1) *
+             r +
+         D0);
+  } else {
+    r = r - 5.0;
+    num =
+        (((((((E7 * r + E6) * r + E5) * r + E4) * r + E3) * r + E2) * r + E1) *
+             r +
+         E0);
+    den =
+        (((((((F7 * r + F6) * r + F5) * r + F4) * r + F3) * r + F2) * r + F1) *
+             r +
+         F0);
+  }
+
+  if (x < 0) {
+    return -num / den;
+  } else {
+    return num / den;
+  }
+}
+
+template <typename T>
+struct TruncatedNormal {
+  T mean, std;
+  T a_normal_cdf;
+  T b_normal_cdf;
+  TruncatedNormal(T mean, T std) : mean(mean), std(std) {
+    auto normal_cdf = [](T x) {
+      return (1.0 + std::erf(x / std::sqrt(2.0))) / 2.0;
+    };
+    a_normal_cdf = normal_cdf(-2.0);
+    b_normal_cdf = normal_cdf(2.0);
+  }
+
+  T operator()(T value) const {
+    auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value;
+    return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean;
+  }
+};
+
+template <typename T, typename Context>
+void TruncatedGaussianRandomKernel(const Context& ctx,
+                                   const ScalarArray& shape,
+                                   float mean,
+                                   float std,
+                                   int seed,
+                                   DataType dtype,
+                                   DenseTensor* out);
+
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/truncated_gaussian_random_sig.cc b/paddle/phi/ops/compat/truncated_gaussian_random_sig.cc
new file mode 100644
index 0000000000000..3c4d47f8c7221
--- /dev/null
+++ b/paddle/phi/ops/compat/truncated_gaussian_random_sig.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+KernelSignature TruncatedGaussianRandomOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature("truncated_gaussian_random",
+                         {},
+                         {"shape", "mean", "std", "seed", "dtype"},
+                         {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_ARG_MAPPING_FN(truncated_gaussian_random,
+                           phi::TruncatedGaussianRandomOpArgumentMapping);
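
For readers following the kernels above: TruncatedNormal is a plain inverse-CDF transform. A uniform sample u in (0, 1) is squeezed into the interval [Phi(-2), Phi(2)] and pushed through the inverse normal CDF, so every output lands within two standard deviations of the mean. The sketch below is a minimal standalone illustration of that transform, not part of the patch; it uses nothing from Paddle, and ErfinvBisect is a slow bisection stand-in for the rational-approximation Erfinv defined in truncated_gaussian_random_kernel.h.

// Minimal sketch of the inverse-CDF transform used by TruncatedNormal.
// ErfinvBisect is an illustrative stand-in, not the kernel's Erfinv.
#include <cmath>
#include <cstdio>
#include <limits>
#include <random>

// Invert erf on (-1, 1) by bisection; slow but self-contained.
double ErfinvBisect(double y) {
  double lo = -6.0, hi = 6.0;
  for (int i = 0; i < 80; ++i) {
    double mid = 0.5 * (lo + hi);
    if (std::erf(mid) < y) {
      lo = mid;
    } else {
      hi = mid;
    }
  }
  return 0.5 * (lo + hi);
}

int main() {
  const double mean = 0.0, stddev = 1.0;
  const double a_cdf = (1.0 + std::erf(-2.0 / std::sqrt(2.0))) / 2.0;  // Phi(-2)
  const double b_cdf = (1.0 + std::erf(2.0 / std::sqrt(2.0))) / 2.0;   // Phi(+2)

  std::mt19937_64 engine(100);
  std::uniform_real_distribution<double> dist(
      std::numeric_limits<double>::min(), 1.0);

  for (int i = 0; i < 5; ++i) {
    double u = dist(engine);                 // uniform sample in (0, 1)
    double p = a_cdf + (b_cdf - a_cdf) * u;  // squeeze into [Phi(-2), Phi(2)]
    double x = std::sqrt(2.0) * ErfinvBisect(2.0 * p - 1.0) * stddev + mean;
    std::printf("%f\n", x);                  // always within mean +/- 2*stddev
  }
  return 0;
}

The GPU kernels perform the same per-element computation, except the uniform draw comes from thrust::minstd_rand discarded to the element index (plus the generator offset), and Erfinv uses the closed-form rational approximation from the header.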