From 8744ba7295c255b9a10d78d3c6da565c733a9fce Mon Sep 17 00:00:00 2001 From: liutiexing Date: Wed, 22 Sep 2021 20:09:22 +0000 Subject: [PATCH 1/7] add align for WorkQueue --- .../framework/new_executor/CMakeLists.txt | 2 +- .../framework/new_executor/event_count.h | 5 +- .../new_executor/nonblocking_threadpool.h | 16 ++--- .../fluid/framework/new_executor/run_queue.h | 2 +- .../fluid/framework/new_executor/workqueue.cc | 16 +++-- .../framework/new_executor/workqueue_utils.cc | 59 +++++++++++++++++++ .../framework/new_executor/workqueue_utils.h | 4 ++ 7 files changed, 88 insertions(+), 16 deletions(-) create mode 100644 paddle/fluid/framework/new_executor/workqueue_utils.cc diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt index f66d743620b45..365083a34782a 100644 --- a/paddle/fluid/framework/new_executor/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/CMakeLists.txt @@ -2,7 +2,7 @@ set(INTERPRETERCORE_DEPS op_registry device_context scope framework_proto data_f lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method graph_to_program_pass variable_helper timer monitor) -cc_library(workqueue SRCS workqueue.cc DEPS enforce) +cc_library(workqueue SRCS workqueue.cc workqueue_utils.cc DEPS enforce) cc_library(interpretercore_garbage_collector SRCS interpretercore_garbage_collector.cc DEPS workqueue ${DEVICE_EVENT_LIBS}) cc_library(interpretercore_util SRCS interpretercore_util.cc DEPS ${INTERPRETERCORE_DEPS} workqueue) cc_library(event_manager SRCS event_manager.cc DEPS ${DEVICE_EVENT_LIBS} glog) diff --git a/paddle/fluid/framework/new_executor/event_count.h b/paddle/fluid/framework/new_executor/event_count.h index f374456ca3814..0c6d49042d22d 100644 --- a/paddle/fluid/framework/new_executor/event_count.h +++ b/paddle/fluid/framework/new_executor/event_count.h @@ -50,6 +50,7 @@ #include #include #include +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" namespace paddle { namespace framework { @@ -60,7 +61,7 @@ class EventCount { explicit EventCount(size_t waiter_num) : state_(kStackMask) { assert(waiter_num < (1 << kWaiterBits) - 1); - void* buffer = malloc(sizeof(Waiter) * waiter_num); + void* buffer = AlignedMalloc(sizeof(Waiter) * waiter_num, alignof(Waiter)); if (buffer == nullptr) { return; } @@ -78,7 +79,7 @@ class EventCount { ~EventCount() { // Ensure there are no waiters. 
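+    // waiters_ was obtained from AlignedMalloc in the constructor, so it must
+    // be released with the matching AlignedFree rather than plain free().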
assert(state_.load() == kStackMask); - free(waiters_); + AlignedFree(waiters_); } Waiter* GetWaiter(size_t waiter_index) { diff --git a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h index 56edcecd17f37..56a036527a56b 100644 --- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h @@ -56,9 +56,9 @@ class TaskTracker { } private: - std::atomic num_tasks_{0}; - EventCount wait_empty_cv_; - std::atomic wait_empty_{false}; + alignas(64) std::atomic num_tasks_{0}; + alignas(64) EventCount wait_empty_cv_; + alignas(64) std::atomic wait_empty_{false}; }; template @@ -70,15 +70,15 @@ class ThreadPoolTempl { ThreadPoolTempl(int num_threads, bool allow_spinning, Environment env = Environment()) : env_(env), - num_threads_(num_threads), allow_spinning_(allow_spinning), - thread_data_(num_threads), global_steal_partition_(EncodePartition(0, num_threads_)), blocked_(0), spinning_(0), done_(false), cancelled_(false), - ec_(num_threads_) { + ec_(num_threads), + num_threads_(num_threads), + thread_data_(num_threads) { // Calculate coprimes of all numbers [1, num_threads]. // Coprimes are used for random walks over all threads in Steal // and NonEmptyQueueIndex. Iteration is based on the fact that if we take @@ -259,9 +259,7 @@ class ThreadPoolTempl { }; Environment env_; - const int num_threads_; const bool allow_spinning_; - std::vector thread_data_; std::vector> all_coprimes_; unsigned global_steal_partition_; std::atomic blocked_; @@ -269,6 +267,8 @@ class ThreadPoolTempl { std::atomic done_; std::atomic cancelled_; EventCount ec_; + const int num_threads_; + std::vector thread_data_; // Main worker thread loop. void WorkerLoop(int thread_id) { diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/run_queue.h index 707aadd315885..13035237ff8b4 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/run_queue.h @@ -204,7 +204,6 @@ class RunQueue { kReady, }; - std::mutex mutex_; // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of // front/back, respectively. The remaining bits contain modification counters // that are incremented on Push operations. This allows us to (1) distinguish @@ -214,6 +213,7 @@ class RunQueue { // modification counters. 
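+  // front_ and back_ are kept on separate 64-byte cache lines (alignas(64))
+  // so threads touching opposite ends of the queue do not false-share.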
alignas(64) std::atomic front_; alignas(64) std::atomic back_; + std::mutex mutex_; Elem array_[kSize]; // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index 184d9d6998464..a9dfddd689669 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -18,14 +18,18 @@ class WorkQueueImpl : public WorkQueue { explicit WorkQueueImpl(const WorkQueueOptions& options) : WorkQueue(options), queue_(nullptr), tracker_(nullptr) { if (options_.track_task) { - tracker_ = new TaskTracker; + void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); + tracker_ = new (storage) TaskTracker; } queue_ = new NonblockingThreadPool(options_.num_threads, options_.allow_spinning); } virtual ~WorkQueueImpl() { - delete tracker_; + if (tracker_ != nullptr) { + tracker_->~TaskTracker(); + AlignedFree(tracker_); + } delete queue_; } @@ -89,7 +93,8 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; if (options.track_task && tracker_ == nullptr) { - tracker_ = new TaskTracker; + void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); + tracker_ = new (storage) TaskTracker; } queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool(options.num_threads, options.allow_spinning); @@ -100,7 +105,10 @@ WorkQueueGroupImpl::~WorkQueueGroupImpl() { for (auto queue : queues_) { queue->~NonblockingThreadPool(); } - delete tracker_; + if (tracker_ != nullptr) { + tracker_->~TaskTracker(); + AlignedFree(tracker_); + } free(queues_storage_); } diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue_utils.cc new file mode 100644 index 0000000000000..2ea49e676a807 --- /dev/null +++ b/paddle/fluid/framework/new_executor/workqueue_utils.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
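+// Aligned heap allocation helpers for the new executor. Types with
+// over-aligned (e.g. alignas(64)) members cannot rely on plain malloc/new
+// before C++17, so they are created roughly like this (see TaskTracker in
+// workqueue.cc):
+//   void* buf = AlignedMalloc(sizeof(T), alignof(T));
+//   T* obj = new (buf) T();  // placement-new into aligned storage
+//   ...
+//   obj->~T();               // destroy explicitly
+//   AlignedFree(buf);        // release with the matching deallocator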
+ +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include +#include + +namespace paddle { +namespace framework { + +void* AlignedMalloc(size_t size, size_t alignment) { + assert(alignment >= sizeof(void*) && (alignment & (alignment - 1)) == 0); + size = (size + alignment - 1) / alignment * alignment; +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + void* aligned_mem = nullptr; + if (posix_memalign(&aligned_mem, alignment, size) != 0) { + aligned_mem = nullptr; + } + return aligned_mem; +#elif defined(_WIN32) + return _aligned_malloc(size, alignment); +#else + void* mem = malloc(size + alignment); + if (mem == nullptr) { + return nullptr; + } + size_t adjust = alignment - reinterpret_cast(mem) % alignment; + void* aligned_mem = reinterpret_cast(mem) + adjust; + *(reinterpret_cast(aligned_mem) - 1) = mem; + assert(reinterpret_cast(aligned_mem) % alignment == 0); + return aligned_mem; +#endif +} + +void AlignedFree(void* mem_ptr) { +#if defined(_POSIX_C_SOURCE) && _POSIX_C_SOURCE >= 200112L + free(mem_ptr); +#elif defined(_WIN32) + _aligned_free(mem_ptr); +#else + if (mem_ptr) { + free(*(reinterpret_cast(mem_ptr) - 1)); + } +#endif +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index 00183eadcbb5c..6907f2f17da0d 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -59,5 +59,9 @@ class CounterGuard { Holder* counter_holder_{nullptr}; }; +void* AlignedMalloc(size_t size, size_t alignment); + +void AlignedFree(void* memory_ptr); + } // namespace framework } // namespace paddle From 6f00ace9b1b5c5d45b5843472b5eeb4ce1602509 Mon Sep 17 00:00:00 2001 From: liutiexing Date: Thu, 23 Sep 2021 13:44:55 +0000 Subject: [PATCH 2/7] add spinlock --- .../fluid/framework/new_executor/run_queue.h | 9 +++-- .../fluid/framework/new_executor/workqueue.cc | 4 +- .../framework/new_executor/workqueue_utils.h | 40 +++++++++++++++++++ 3 files changed, 47 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/run_queue.h index 13035237ff8b4..1d32eb05d4d51 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/run_queue.h @@ -37,6 +37,7 @@ #include #include #include +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" namespace paddle { namespace framework { @@ -101,7 +102,7 @@ class RunQueue { // PushBack adds w at the end of the queue. // If queue is full returns w, otherwise returns default-constructed Work. Work PushBack(Work w) { - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[(back - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -123,7 +124,7 @@ class RunQueue { return Work(); } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[back & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -145,7 +146,7 @@ class RunQueue { return 0; } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); unsigned size = Size(); unsigned mid = back; @@ -213,7 +214,7 @@ class RunQueue { // modification counters. 
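+  // mutex_ (declared below) is now the user-space SpinLock from
+  // workqueue_utils.h instead of std::mutex; it still guards the
+  // back-of-queue operations locked above.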
alignas(64) std::atomic front_; alignas(64) std::atomic back_; - std::mutex mutex_; + SpinLock mutex_; Elem array_[kSize]; // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index bc5a4e27dc528..8c6eeab4d5c0a 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -166,7 +166,7 @@ std::unique_ptr CreateMultiThreadedWorkQueue( "WorkQueueOptions.num_threads must be " "greater than 1.")); std::unique_ptr ptr(new WorkQueueImpl(options)); - return ptr; + return std::move(ptr); } std::unique_ptr CreateWorkQueueGroup( @@ -176,7 +176,7 @@ std::unique_ptr CreateWorkQueueGroup( "For a WorkQueueGroup, the number of WorkQueueOptions " "must be greater than 1.")); std::unique_ptr ptr(new WorkQueueGroupImpl(queues_options)); - return ptr; + return std::move(ptr); } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index 6907f2f17da0d..1aab4529b023c 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -14,9 +14,16 @@ #pragma once +#include #include #include #include +#if defined(_M_X64) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(__i386__) +#define __WQ_x86__ +#include +#endif +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -63,5 +70,38 @@ void* AlignedMalloc(size_t size, size_t alignment); void AlignedFree(void* memory_ptr); +static inline void CpuRelax() { +#if defined(__WQ_x86__) + _mm_pause(); +#endif +} + +class SpinLock { + public: + void lock() { + for (;;) { + if (!lock_.exchange(true, std::memory_order_acquire)) { + break; + } + constexpr int kMaxLoop = 32; + for (int loop = 1; lock_.load(std::memory_order_relaxed);) { + if (loop <= kMaxLoop) { + for (int i = 1; i <= loop; ++i) { + CpuRelax(); + } + loop *= 2; + } else { + std::this_thread::yield(); + } + } + } + } + + void unlock() { lock_.store(false, std::memory_order_release); } + + private: + std::atomic lock_{false}; +}; + } // namespace framework } // namespace paddle From 54aa3324dc5e0cc04f973488786659a2c2a90d01 Mon Sep 17 00:00:00 2001 From: liutiexing Date: Tue, 12 Oct 2021 09:39:19 +0000 Subject: [PATCH 3/7] merge develop --- AUTHORS.md | 14 +- cmake/external/dlpack.cmake | 2 +- cmake/operators.cmake | 2 +- cmake/third_party.cmake | 4 +- log | Bin 2816 -> 0 bytes paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/channel.h | 20 +- paddle/fluid/framework/data_feed.cc | 754 ++++++++++++- paddle/fluid/framework/data_feed.h | 353 +++++- paddle/fluid/framework/data_feed_factory.cc | 5 +- paddle/fluid/framework/data_set.cc | 194 +++- paddle/fluid/framework/data_set.h | 40 +- paddle/fluid/framework/dataset_factory.cc | 3 +- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 7 +- .../fluid/framework/details/build_strategy.h | 2 + .../details/computation_op_handle.cc | 8 +- .../fast_threaded_ssa_graph_executor.cc | 8 +- .../details/fused_all_reduce_op_handle.cc | 85 ++ .../details/fused_all_reduce_op_handle.h | 7 + .../details/scale_loss_grad_op_handle.cc | 21 +- .../details/scale_loss_grad_op_handle.h | 6 + .../scope_buffered_ssa_graph_executor.cc | 53 +- .../scope_buffered_ssa_graph_executor.h | 2 +- paddle/fluid/framework/device_worker.h | 8 - 
.../framework/distributed_strategy.proto | 1 + paddle/fluid/framework/dlpack_tensor.cc | 80 +- paddle/fluid/framework/dlpack_tensor.h | 2 +- paddle/fluid/framework/dlpack_tensor_test.cc | 29 +- paddle/fluid/framework/fleet/fleet_wrapper.cc | 14 + paddle/fluid/framework/fleet/fleet_wrapper.h | 1 + paddle/fluid/framework/fleet/gloo_wrapper.cc | 45 +- paddle/fluid/framework/fleet/gloo_wrapper.h | 20 + .../fluid/framework/fleet/ps_gpu_wrapper.cc | 100 +- paddle/fluid/framework/fleet/ps_gpu_wrapper.h | 10 +- paddle/fluid/framework/ir/CMakeLists.txt | 2 + paddle/fluid/framework/ir/generate_pass.cc | 110 ++ paddle/fluid/framework/ir/generate_pass.h | 153 ++- .../framework/ir/generate_pass_tester.cc | 267 +---- .../framework/ir/graph_pattern_detector.cc | 37 +- paddle/fluid/framework/ir/is_test_pass.cc | 2 +- .../multi_devices_graph_pass/CMakeLists.txt | 2 +- .../modify_op_lock_and_record_event_pass.cc | 14 +- .../fluid/framework/ir/paddle_to_cinn_pass.cc | 31 + .../fluid/framework/ir/paddle_to_cinn_pass.h | 30 + .../framework/ir/paddle_to_cinn_pass_test.cc | 40 + .../ir/quant_conv2d_dequant_fuse_pass.cc | 3 +- .../framework/new_executor/interpretercore.cc | 102 +- .../framework/new_executor/interpretercore.h | 10 +- .../new_executor/interpretercore_util.cc | 29 +- .../new_executor/interpretercore_util.h | 9 +- .../new_executor/new_executor_defs.h | 9 +- .../fluid/framework/new_executor/run_queue.h | 9 +- paddle/fluid/framework/op_desc.h | 2 +- paddle/fluid/framework/operator.cc | 17 +- .../fluid/framework/operator_kernel_configs.h | 2 + .../framework/paddle2cinn/CMakeLists.txt | 7 + .../framework/paddle2cinn/cinn_cache_key.cc | 87 ++ .../framework/paddle2cinn/cinn_cache_key.h | 63 ++ .../paddle2cinn/cinn_cache_key_test.cc | 101 ++ .../paddle2cinn/cinn_compiled_object.cc | 50 + .../paddle2cinn/cinn_compiled_object.h | 50 + .../paddle2cinn/cinn_compiled_object_test.cc | 41 + .../framework/paddle2cinn/cinn_runner.cc | 61 + .../fluid/framework/paddle2cinn/cinn_runner.h | 65 ++ .../framework/paddle2cinn/cinn_runner_test.cc | 44 + paddle/fluid/framework/parallel_executor.cc | 179 +++ paddle/fluid/framework/parallel_executor.h | 7 + paddle/fluid/framework/ps_gpu_trainer.cc | 45 +- paddle/fluid/framework/ps_gpu_worker.cc | 34 +- paddle/fluid/framework/tensor.cc | 6 +- paddle/fluid/framework/tensor_util.cc | 21 +- paddle/fluid/framework/trainer.h | 8 +- paddle/fluid/framework/var_desc.h | 2 +- .../fluid/imperative/partial_grad_engine.cc | 10 +- paddle/fluid/imperative/variable_wrapper.h | 1 + .../fluid/inference/api/analysis_predictor.cc | 1 + .../inference/tensorrt/convert/CMakeLists.txt | 1 + .../inference/tensorrt/convert/mish_op.cc | 74 ++ .../inference/tensorrt/convert/pool2d_op.cc | 27 + .../tensorrt/convert/test_mish_op.cc | 47 + .../inference/tensorrt/convert/yolo_box_op.cc | 9 +- paddle/fluid/inference/tensorrt/op_teller.cc | 90 +- .../inference/tensorrt/plugin/CMakeLists.txt | 1 + .../tensorrt/plugin/elementwise_op_plugin.cu | 6 - .../tensorrt/plugin/mish_op_plugin.cu | 235 ++++ .../tensorrt/plugin/mish_op_plugin.h | 175 +++ .../tensorrt/plugin/yolo_box_op_plugin.cu | 69 +- .../tensorrt/plugin/yolo_box_op_plugin.h | 3 + paddle/fluid/memory/allocation/CMakeLists.txt | 6 +- .../memory/allocation/aligned_allocator.cc | 1 + .../memory/allocation/allocator_facade.cc | 181 ++- .../memory/allocation/allocator_facade.h | 8 + .../auto_growth_best_fit_allocator.cc | 23 +- .../auto_growth_best_fit_allocator.h | 3 +- .../auto_growth_best_fit_allocator_test.cc | 14 +- 
paddle/fluid/memory/allocation/spin_lock.h | 43 +- paddle/fluid/operators/CMakeLists.txt | 6 +- .../fluid/operators/arg_min_max_op_base.cu.h | 21 +- paddle/fluid/operators/batch_norm_op_npu.cc | 62 +- paddle/fluid/operators/cast_op.cu | 65 +- paddle/fluid/operators/concat_op.cc | 18 +- paddle/fluid/operators/conv_cudnn_helper.h | 3 + paddle/fluid/operators/conv_op_npu.cc | 5 - .../fluid/operators/detection/CMakeLists.txt | 7 +- .../operators/detection/box_coder_op_npu.cc | 373 ++++++ paddle/fluid/operators/dropout_impl.cu.h | 26 +- paddle/fluid/operators/dropout_impl_util.h | 53 + paddle/fluid/operators/dropout_op.cc | 13 + paddle/fluid/operators/eig_op.cc | 168 +++ paddle/fluid/operators/eig_op.h | 330 ++++++ .../elementwise/elementwise_mul_op_npu.cc | 132 ++- .../elementwise/elementwise_sub_op_npu.cc | 4 +- .../fluid/operators/fill_any_like_op_npu.cc | 12 +- paddle/fluid/operators/fill_diagonal_op.cc | 18 +- paddle/fluid/operators/fill_diagonal_op.cu | 16 +- paddle/fluid/operators/flatten_op.cc | 65 +- paddle/fluid/operators/fused/CMakeLists.txt | 1 + .../operators/fused/cudnn_bn_add_relu_test.cc | 814 +++++++++++++ .../fused/cudnn_bn_stats_finalize.cu.h | 181 +++ .../operators/fused/cudnn_fusion_helper.h | 65 +- .../operators/fused/cudnn_norm_conv.cu.h | 357 ++++-- .../operators/fused/cudnn_norm_conv_test.cc | 500 +++++--- .../fused/cudnn_scale_bias_add_relu.cu.h | 292 +++++ .../operators/fused/fused_dropout_helper.h | 282 +++++ paddle/fluid/operators/group_norm_op_npu.cc | 306 +++++ paddle/fluid/operators/index_select_op_npu.cc | 17 +- paddle/fluid/operators/interpolate_v2_op.cu | 16 +- .../kernel_primitives/functor_primitives.h | 230 ++++ .../kernel_primitives/kernel_primitives.h | 1 + paddle/fluid/operators/math/matrix_solve.h | 40 + paddle/fluid/operators/matmul_v2_op_npu.cc | 477 +++++--- .../operators/mkldnn/activation_mkldnn_op.cc | 3 +- paddle/fluid/operators/mkldnn/axpy_handler.cc | 1 - .../operators/mkldnn/concat_mkldnn_op.cc | 71 ++ .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 669 ++++------- .../operators/mkldnn/matmul_v2_mkldnn_op.cc | 8 +- .../operators/mkldnn/reshape_mkldnn_op.cc | 311 +++-- paddle/fluid/operators/npu_op_runner.cc | 8 + .../operators/optimizers/lars_momentum_op.cu | 391 +++++-- paddle/fluid/operators/psroi_pool_op.cc | 22 +- paddle/fluid/operators/psroi_pool_op.cu | 105 +- paddle/fluid/operators/psroi_pool_op.h | 103 +- .../fluid/operators/reduce_ops/reduce_op.cu.h | 13 +- paddle/fluid/operators/reshape_op.cc | 44 +- paddle/fluid/operators/run_program_op.h | 26 +- paddle/fluid/operators/scale_op_npu.cc | 5 +- paddle/fluid/operators/seed_op.cc | 18 + paddle/fluid/operators/seed_op.cu | 30 +- paddle/fluid/operators/seed_op.h | 1 + paddle/fluid/operators/set_value_op_npu.cc | 464 +++----- paddle/fluid/operators/slice_op_npu.cc | 66 +- .../softmax_with_cross_entropy_op.cc | 17 +- paddle/fluid/operators/sparse_attention_op.cc | 193 ++++ paddle/fluid/operators/sparse_attention_op.cu | 537 +++++++++ paddle/fluid/operators/spectral_op.cu | 5 +- paddle/fluid/operators/squeeze_op.cc | 56 +- paddle/fluid/operators/svd_helper.h | 66 ++ paddle/fluid/operators/top_k_op_npu.cc | 4 +- paddle/fluid/operators/transpose_op_npu.cc | 21 +- paddle/fluid/operators/unique_op.h | 5 +- paddle/fluid/operators/unstack_op.h | 2 +- paddle/fluid/platform/CMakeLists.txt | 5 + paddle/fluid/platform/collective_helper.cc | 8 +- paddle/fluid/platform/cuda_graph.cc | 104 ++ paddle/fluid/platform/cuda_graph.h | 142 +++ .../platform/cuda_graph_with_memory_pool.cc | 48 + 
.../platform/cuda_graph_with_memory_pool.h | 64 ++ paddle/fluid/platform/dynload/cusparse.cc | 4 + paddle/fluid/platform/dynload/cusparse.h | 20 +- paddle/fluid/platform/enforce.h | 14 + paddle/fluid/platform/enforce_test.cc | 22 +- paddle/fluid/platform/external_error.proto | 1 + paddle/fluid/platform/flags.cc | 44 + paddle/fluid/platform/gpu_info.cc | 46 + paddle/fluid/platform/gpu_info.h | 3 + paddle/fluid/platform/mkldnn_helper.h | 6 + paddle/fluid/platform/mkldnn_reuse.h | 568 +--------- paddle/fluid/platform/type_defs.h | 3 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/data_set_py.cc | 2 - paddle/fluid/pybind/fleet_wrapper_py.cc | 2 + paddle/fluid/pybind/ir.cc | 10 +- paddle/fluid/pybind/op_function_generator.cc | 10 +- paddle/fluid/pybind/pybind.cc | 86 +- paddle/scripts/paddle_build.bat | 53 +- paddle/scripts/paddle_build.sh | 1 - paddle/testing/paddle_gtest_main.cc | 25 +- python/paddle/__init__.py | 13 +- python/paddle/autograd/__init__.py | 1 + python/paddle/autograd/functional.py | 515 +++++++++ python/paddle/autograd/utils.py | 49 + python/paddle/device/cuda/__init__.py | 127 +++ python/paddle/device/cuda/graphs.py | 57 + .../distributed/auto_parallel/__init__.py | 2 + .../distributed/auto_parallel/completion.py | 170 +++ .../distributed/auto_parallel/context.py | 3 + .../auto_parallel/operators/dist_embedding.py | 14 +- .../distributed/auto_parallel/parallelizer.py | 9 +- .../distributed/auto_parallel/reshard.py | 1002 +++++++++++++++++ .../dygraph_optimizer/__init__.py | 1 + .../hybrid_parallel_optimizer.py | 76 +- .../meta_optimizers/raw_program_optimizer.py | 2 + .../sharding/offload_helper.py | 227 +++- .../fleet/meta_optimizers/sharding/utils.py | 68 +- .../meta_optimizers/sharding_optimizer.py | 141 ++- .../parallel_layers/mp_layers.py | 8 +- .../parallel_layers/pp_layers.py | 7 + python/paddle/distributed/fleet/utils/fs.py | 45 +- python/paddle/distributed/utils.py | 3 - python/paddle/fft.py | 61 + python/paddle/fluid/backward.py | 9 +- python/paddle/fluid/clip.py | 8 +- python/paddle/fluid/contrib/sparsity/utils.py | 13 +- python/paddle/fluid/dataset.py | 2 + python/paddle/fluid/dygraph/base.py | 2 +- python/paddle/fluid/executor.py | 28 +- python/paddle/fluid/framework.py | 102 +- .../fluid/incubate/fleet/base/role_maker.py | 15 +- .../fleet/parameter_server/pslib/__init__.py | 29 +- .../pslib/optimizer_factory.py | 2 +- .../paddle/fluid/incubate/fleet/utils/hdfs.py | 4 +- python/paddle/fluid/io.py | 3 +- python/paddle/fluid/ir.py | 10 +- python/paddle/fluid/layers/nn.py | 2 + python/paddle/fluid/optimizer.py | 8 +- .../fluid/tests/unittests/CMakeLists.txt | 21 + .../tests/unittests/autograd/CMakeLists.txt | 10 + .../tests/unittests/autograd/test_hessian.py | 140 +++ .../tests/unittests/autograd/test_jacobian.py | 162 +++ .../tests/unittests/autograd/test_vjp_jvp.py | 294 +++++ .../fluid/tests/unittests/autograd/utils.py | 107 ++ .../fluid/tests/unittests/dist_transformer.py | 2 +- .../fluid/tests/unittests/fft/test_fft.py | 242 ++-- .../fluid/tests/unittests/hdfs_test_utils.py | 9 + .../unittests/hybrid_parallel_pp_alexnet.py | 17 +- .../unittests/hybrid_parallel_pp_clip_grad.py | 35 + .../hybrid_parallel_sharding_model.py | 19 +- .../ir/inference/test_trt_activation_pass.py | 36 + .../test_trt_convert_conv2d_transpose.py | 14 +- .../test_trt_convert_depthwise_conv2d.py | 11 +- ..._trt_convert_depthwise_conv2d_transpose.py | 12 +- .../inference/test_trt_convert_elementwise.py | 135 ++- .../test_trt_convert_emb_eltwise_layernorm.py | 12 + 
.../inference/test_trt_convert_group_norm.py | 26 +- .../ir/inference/test_trt_convert_mish.py | 174 +++ .../test_trt_convert_multihead_matmul.py | 31 +- .../ir/inference/test_trt_convert_pool2d.py | 30 +- .../ir/inference/test_trt_yolo_box_op.py | 51 + .../unittests/ir/test_ir_generate_pass.py | 3 + .../ir/test_ir_subgraph_python_interface.py | 96 ++ .../mkldnn/test_activation_bf16_mkldnn_op.py | 44 +- .../mkldnn/test_concat_bf16_mkldnn_op.py | 27 +- .../unittests/mkldnn/test_concat_mkldnn_op.py | 114 +- .../mkldnn/test_flatten_mkldnn_op.py | 151 +++ .../mkldnn/test_matmul_v2_mkldnn_op.py | 21 +- .../fluid/tests/unittests/npu/CMakeLists.txt | 1 + .../unittests/npu/test_batch_norm_op_npu.py | 54 +- .../unittests/npu/test_box_coder_op_npu.py | 252 +++++ .../npu/test_elementwise_mul_op_npu.py | 274 +++-- .../npu/test_elementwise_sub_op_npu.py | 5 + .../npu/test_fill_any_like_op_npu.py | 6 + .../unittests/npu/test_group_norm_op_npu.py | 217 ++++ .../unittests/npu/test_matmulv2_op_npu.py | 504 +++++---- .../unittests/npu/test_set_value_op_npu.py | 334 +++--- .../tests/unittests/npu/test_slice_op_npu.py | 290 +++++ .../tests/unittests/npu/test_top_k_op_npu.py | 36 + .../unittests/npu/test_transpose_op_npu.py | 98 +- .../paddle/fluid/tests/unittests/op_test.py | 90 +- .../fluid/tests/unittests/test_adamw_op.py | 166 ++- .../unittests/test_auto_parallel_reshard.py | 287 +++++ .../test_auto_parallel_reshard_dpmppp.py | 173 +++ .../test_auto_parallel_reshard_mppp.py | 231 ++++ .../test_auto_parallel_reshard_serial.py | 184 +++ .../tests/unittests/test_batch_norm_op.py | 37 +- .../fluid/tests/unittests/test_cast_op.py | 38 +- .../fluid/tests/unittests/test_concat_op.py | 2 +- .../fluid/tests/unittests/test_conv2d_op.py | 3 +- .../unittests/test_conv2d_transpose_op.py | 2 +- .../unittests/test_cross_entropy_loss.py | 50 + .../test_cuda_device_name_capability.py | 55 + .../fluid/tests/unittests/test_cuda_graph.py | 147 +++ .../unittests/test_dist_mnist_fleetapi.py | 6 +- .../fluid/tests/unittests/test_dropout_op.py | 69 ++ .../fluid/tests/unittests/test_eig_op.py | 250 ++++ .../test_fleet_hybrid_meta_optimizer.py | 90 +- .../test_fleet_sharding_meta_optimizer.py | 60 +- .../unittests/test_get_device_properties.py | 70 ++ .../fluid/tests/unittests/test_hdfs2.py | 1 + .../fluid/tests/unittests/test_linalg_cond.py | 16 +- .../tests/unittests/test_multi_dot_op.py | 18 +- ...test_parallel_dygraph_pipeline_parallel.py | 3 + .../test_parallel_executor_run_cinn.py | 56 + .../tests/unittests/test_psroi_pool_op.py | 300 ++++- .../fluid/tests/unittests/test_rnn_dp.py | 157 +++ .../tests/unittests/test_run_program_op.py | 48 + .../fluid/tests/unittests/test_seed_op.py | 4 +- .../unittests/test_sparse_attention_op.py | 318 ++++++ .../unittests/test_tensor_fill_diagonal_.py | 30 + .../fluid/tests/unittests/test_tensordot.py | 238 ++++ .../white_list/op_threshold_white_list.py | 1 + python/paddle/fluid/transpiler/collective.py | 267 ++++- python/paddle/linalg.py | 2 + python/paddle/nn/functional/__init__.py | 3 + python/paddle/nn/functional/loss.py | 55 +- .../paddle/nn/functional/sparse_attention.py | 144 +++ python/paddle/optimizer/adamw.py | 8 +- python/paddle/static/__init__.py | 2 + python/paddle/tensor/__init__.py | 5 + python/paddle/tensor/fft.py | 66 +- python/paddle/tensor/linalg.py | 223 ++-- python/paddle/tensor/manipulation.py | 208 ++++ python/paddle/tests/test_dlpack.py | 41 + python/paddle/tests/test_ops_roi_align.py | 108 ++ python/paddle/tests/test_ops_roi_pool.py | 109 ++ 
python/paddle/utils/dlpack.py | 18 +- python/paddle/utils/install_check.py | 58 +- python/paddle/vision/ops.py | 399 ++++++- tools/check_file_diff_approvals.sh | 2 +- .../dockerfile/build_scripts/install_cudnn.sh | 8 + tools/dockerfile/build_scripts/install_trt.sh | 5 + tools/dockerfile/centos7_manylinux.sh | 22 + tools/externalError/README.md | 30 +- tools/externalError/spider.py | 29 +- tools/externalError/start.sh | 2 +- tools/get_pr_ut.py | 1 + 326 files changed, 22129 insertions(+), 4059 deletions(-) delete mode 100644 log create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.cc create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass.h create mode 100644 paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc create mode 100644 paddle/fluid/framework/paddle2cinn/CMakeLists.txt create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_cache_key.h create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_runner.cc create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_runner.h create mode 100644 paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/mish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/convert/test_mish_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h create mode 100644 paddle/fluid/operators/detection/box_coder_op_npu.cc create mode 100644 paddle/fluid/operators/dropout_impl_util.h create mode 100644 paddle/fluid/operators/eig_op.cc create mode 100644 paddle/fluid/operators/eig_op.h create mode 100644 paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc create mode 100644 paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h create mode 100644 paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h create mode 100644 paddle/fluid/operators/fused/fused_dropout_helper.h create mode 100644 paddle/fluid/operators/group_norm_op_npu.cc create mode 100644 paddle/fluid/operators/kernel_primitives/functor_primitives.h create mode 100644 paddle/fluid/operators/sparse_attention_op.cc create mode 100644 paddle/fluid/operators/sparse_attention_op.cu create mode 100644 paddle/fluid/platform/cuda_graph.cc create mode 100644 paddle/fluid/platform/cuda_graph.h create mode 100644 paddle/fluid/platform/cuda_graph_with_memory_pool.cc create mode 100644 paddle/fluid/platform/cuda_graph_with_memory_pool.h create mode 100644 python/paddle/autograd/functional.py create mode 100644 python/paddle/autograd/utils.py create mode 100644 python/paddle/device/cuda/graphs.py create mode 100644 python/paddle/distributed/auto_parallel/reshard.py create mode 100644 python/paddle/fft.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_hessian.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_jacobian.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py create mode 100644 python/paddle/fluid/tests/unittests/autograd/utils.py create mode 100644 
python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py create mode 100644 python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py create mode 100644 python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py create mode 100644 python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py create mode 100644 python/paddle/fluid/tests/unittests/test_cuda_graph.py create mode 100644 python/paddle/fluid/tests/unittests/test_eig_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_get_device_properties.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py create mode 100644 python/paddle/fluid/tests/unittests/test_rnn_dp.py create mode 100644 python/paddle/fluid/tests/unittests/test_sparse_attention_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_tensordot.py create mode 100644 python/paddle/nn/functional/sparse_attention.py create mode 100644 python/paddle/tests/test_ops_roi_align.py create mode 100644 python/paddle/tests/test_ops_roi_pool.py diff --git a/AUTHORS.md b/AUTHORS.md index 1eaaff2977143..60f5b424abb7a 100644 --- a/AUTHORS.md +++ b/AUTHORS.md @@ -3,7 +3,7 @@ | abhinavarora | Abhinav Arora | | andreazanetti | Andrea Zanetti | | arlesniak | Artur Lesniak | -| arogowie-intel | Adam Osewski | +| [arogowie-intel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Adam Osewski | | backyes | Yan-Fei Wang | | baiyfbupt | Yi-Fan Bai | | beckett1124 | Bin Qi | @@ -25,8 +25,8 @@ | hedaoyuan | Dao-Yuan He | | helinwang | He-Lin Wang | | jacquesqiao | Long-Fei Qiao | -| jakpiase | Jakub Piasecki | -| [jczaja](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Jacek Czaja | +| [jakpiase](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jakub Piasecki | +| [jczaja](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jacek Czaja | | JiayiFeng | Jia-Yi Feng | | kbinias | Krzysztof Binias | | kexinzhao | Ke-Xin Zhao | @@ -47,7 +47,8 @@ | pakchoi | Chuan-Jiang Song | | panyx0718 | Xin Pan | | pengli09 | Peng Li | -| pmajchrzak |Piotr Majchrzak | +| [piotrekobiIntel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Paturej | +| [pmajchrzak](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Majchrzak | | pkuyym | Ya-Ming Yang | | pzelazko-intel | Pawel Zelazko | | [pawelpiotrowicz](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Pawel Piotrowicz | @@ -55,12 +56,13 @@ | qingqing01 | Qing-Qing Dang | | reyoung | Yang Yu | | [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus | -| 
[sfraczek](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Sylwester Fraczek | +| [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek | | sneaxiy | Jin-Le Zeng | | Superjom | Chun-Wei Yan | | tensor-tang | Jian Tang | | tianbingsz | Tian-Bing Xu | | tpatejko | Tomasz Patejko | +| [tsocha](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Tomasz Socha | | typhoonzero | Yi Wu | | velconia | Qi-Yang Min | | wanghaoshuang | Hao-Shuang Wang | @@ -68,7 +70,7 @@ | wangzhen-nlp | Zhen Wang | | wen-bo-yang | Wen-Bo Yang | | wojtuss | Wojciech Uss | -| wozna | Joanna Wozna | +| [wozna](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Joanna Wozna | | wwhu | Wei-Wei Hu | | xinghai-sun | Xing-Hai Sun | | Xreki | Yi-Qun Liu | diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 87db181d953af..43ffde7599226 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -18,7 +18,7 @@ set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack) set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack) set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git) -set(DLPACK_TAG v0.2) +set(DLPACK_TAG v0.4) cache_third_party(extern_dlpack REPOSITORY ${DLPACK_REPOSITORY} diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2c010a1e6297f..7541b234ceaa6 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -214,7 +214,7 @@ function(op_library TARGET) foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" -"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" +"sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op" "fused_bn_add_activation_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 44463f29923b2..b3260ba27b072 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -251,8 +251,8 @@ if(WITH_GPU) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) endif() - set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE) - file_download_and_uncompress(${URL} "externalError" MD5 c0749523ebb536eb7382487d645d9cd4) # download file externalErrorMsg.tar.gz + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa) # download file externalErrorMsg.tar.gz if(WITH_TESTING) # copy externalErrorMsg.pb, just for unittest can get error message correctly. set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) diff --git a/log b/log deleted file mode 100644 index c02e10686b5fbcc78a8591519aaa3389dac63a56..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2816 zcmds3U2oeq6y39b#f5-jGORjC+XWlj1V|&dE9Sb{>LA+_L5s9agdzo!vg5V=@4b}l zN{;O`8L$DHUnG;_<+Ngtf*xLf+~HQ<9(CS(yv%TQ-B~Xgn9-=ByB4rcm7iO!M^x;DvU^LS8x# z69JN=1(`9`kGkC#o$oI1L5vXT%rf49mYUvfFFgkLHd1G%K4-PLAKV&v)Y#kbcK8{! 
z+vbjhgD@uyWWTFcn z1IDRkxmX%|Lr9v+94c8q9w%^olEAa437kCtyr19J{l`Bw0D3)`92JanC61=5l>BHD z2uJyi;#$)RPk-L&d69=b0WbZk5E_BN7#;cg{@U0jvmCu@xNCM_vFs*n!zru{^D@s@ zw6HRlUM>O~_no5!L*L!O<7b^-rkHc^?$=>D8vTMIDPc$E0*RD*Hm>+9%88O02{#@1 zEUv*}7U-GO0_sNs8&(Lp405!D*}x|2Z)segL5}fICTQGVxiAM8q z?Z?rzHJB&0!7AObf5RW^)_Z`tK`XlvP3ZWnLQ?~HCvsy~C=cgS(dm z)ri-&-iA3c%o{k^VW)ur12U#VsmJ*PM~`O-JWuYa_R)_xep*BZ6SSnw W5& p, size_t size) { // NOLINT + if (size == 0) { + return 0; + } + std::unique_lock lock(mutex_); + p.resize(size); + size_t finished = Read(size, &p[0], lock, true); + p.resize(finished); + Notify(); + return finished; + } size_t ReadAll(std::vector& p) { // NOLINT p.clear(); size_t finished = 0; @@ -241,17 +253,21 @@ class ChannelObject { return !closed_; } - size_t Read(size_t n, T* p, std::unique_lock& lock) { // NOLINT + size_t Read(size_t n, T* p, std::unique_lock& lock, // NOLINT + bool once = false) { // NOLINT size_t finished = 0; CHECK(n <= MaxCapacity() - reading_count_); reading_count_ += n; while (finished < n && WaitForRead(lock)) { - size_t m = std::min(n - finished, data_.size()); + size_t m = (std::min)(n - finished, data_.size()); for (size_t i = 0; i < m; i++) { p[finished++] = std::move(data_.front()); data_.pop_front(); } reading_count_ -= m; + if (once && m > 0) { + break; + } } reading_count_ -= n - finished; return finished; diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index fdb24ee18eca7..2d089b4721b82 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" USE_INT_STAT(STAT_total_feasign_num_in_mem); +DECLARE_bool(enable_ins_parser_file); namespace paddle { namespace framework { @@ -36,6 +37,107 @@ DLManager& global_dlmanager_pool() { return manager; } +class BufferedLineFileReader { + typedef std::function SampleFunc; + static const int MAX_FILE_BUFF_SIZE = 4 * 1024 * 1024; + class FILEReader { + public: + explicit FILEReader(FILE* fp) : fp_(fp) {} + int read(char* buf, int len) { return fread(buf, sizeof(char), len, fp_); } + + private: + FILE* fp_; + }; + + public: + typedef std::function LineFunc; + + private: + template + int read_lines(T* reader, LineFunc func, int skip_lines) { + int lines = 0; + size_t ret = 0; + char* ptr = NULL; + char* eol = NULL; + total_len_ = 0; + error_line_ = 0; + + SampleFunc spfunc = get_sample_func(); + std::string x; + while (!is_error() && (ret = reader->read(buff_, MAX_FILE_BUFF_SIZE)) > 0) { + total_len_ += ret; + ptr = buff_; + eol = reinterpret_cast(memchr(ptr, '\n', ret)); + while (eol != NULL) { + int size = static_cast((eol - ptr) + 1); + x.append(ptr, size - 1); + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + + x.clear(); + ptr += size; + ret -= size; + eol = reinterpret_cast(memchr(ptr, '\n', ret)); + } + if (ret > 0) { + x.append(ptr, ret); + } + } + if (!is_error() && !x.empty()) { + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + } + return lines; + } + + public: + BufferedLineFileReader() + : random_engine_(std::random_device()()), + uniform_distribution_(0.0f, 1.0f) { + total_len_ = 0; + sample_line_ = 0; + buff_ = + reinterpret_cast(calloc(MAX_FILE_BUFF_SIZE + 1, sizeof(char))); + } + ~BufferedLineFileReader() { free(buff_); } + + int read_file(FILE* fp, LineFunc func, int skip_lines) { + FILEReader reader(fp); + return read_lines(&reader, func, skip_lines); + } + uint64_t 
file_size(void) { return total_len_; } + void set_sample_rate(float r) { sample_rate_ = r; } + size_t get_sample_line() { return sample_line_; } + bool is_error(void) { return (error_line_ > 10); } + + private: + SampleFunc get_sample_func() { + if (std::abs(sample_rate_ - 1.0f) < 1e-5f) { + return [this](void) { return true; }; + } + return [this](void) { + return (uniform_distribution_(random_engine_) < sample_rate_); + }; + } + + private: + char* buff_ = nullptr; + uint64_t total_len_ = 0; + + std::default_random_engine random_engine_; + std::uniform_real_distribution uniform_distribution_; + float sample_rate_ = 1.0f; + size_t sample_line_ = 0; + size_t error_line_ = 0; +}; void RecordCandidateList::ReSize(size_t length) { mutex_.lock(); capacity_ = length; @@ -301,7 +403,7 @@ int InMemoryDataFeed::Next() { << ", thread_id=" << thread_id_; } } else { - VLOG(3) << "enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size(); if (offset_index_ >= batch_offsets_.size()) { VLOG(3) << "offset_index: " << offset_index_ @@ -318,14 +420,7 @@ int InMemoryDataFeed::Next() { VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" << thread_id_; } - /* - if (offset_index_ == batch_offsets_.size() - 1) { - std::vector data; - output_channel_->ReadAll(data); - consume_channel_->Write(std::move(data)); - } - */ - VLOG(3) << "#15 enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size() << " baych_size: " << this->batch_size_; } @@ -1835,5 +1930,646 @@ void PaddleBoxDataFeed::PutToFeedVec(const std::vector& ins_vec) { #endif } +template class InMemoryDataFeed; +void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + platform::errors::PreconditionNotMet( + "Multi_slot_desc has not been set in data_feed_desc")); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + + all_slots_.resize(all_slot_num); + all_slots_info_.resize(all_slot_num); + used_slots_info_.resize(all_slot_num); + use_slot_size_ = 0; + use_slots_.clear(); + + float_total_dims_size_ = 0; + float_total_dims_without_inductives_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + + AllSlotInfo& all_slot = all_slots_info_[i]; + all_slot.slot = slot.name(); + all_slot.type = slot.type(); + all_slot.used_idx = slot.is_used() ? 
use_slot_size_ : -1; + all_slot.slot_value_idx = -1; + + if (slot.is_used()) { + UsedSlotInfo& info = used_slots_info_[use_slot_size_]; + info.idx = i; + info.slot = slot.name(); + info.type = slot.type(); + info.dense = slot.is_dense(); + info.total_dims_without_inductive = 1; + info.inductive_shape_index = -1; + + // record float value and uint64_t value pos + if (info.type[0] == 'u') { + info.slot_value_idx = uint64_use_slot_size_; + all_slot.slot_value_idx = uint64_use_slot_size_; + ++uint64_use_slot_size_; + } else if (info.type[0] == 'f') { + info.slot_value_idx = float_use_slot_size_; + all_slot.slot_value_idx = float_use_slot_size_; + ++float_use_slot_size_; + } + + use_slots_.push_back(slot.name()); + + if (slot.is_dense()) { + for (int j = 0; j < slot.shape_size(); ++j) { + if (slot.shape(j) > 0) { + info.total_dims_without_inductive *= slot.shape(j); + } + if (slot.shape(j) == -1) { + info.inductive_shape_index = j; + } + } + } + if (info.type[0] == 'f') { + float_total_dims_without_inductives_.push_back( + info.total_dims_without_inductive); + float_total_dims_size_ += info.total_dims_without_inductive; + } + info.local_shape.clear(); + for (int j = 0; j < slot.shape_size(); ++j) { + info.local_shape.push_back(slot.shape(j)); + } + ++use_slot_size_; + } + } + used_slots_info_.resize(use_slot_size_); + + feed_vec_.resize(used_slots_info_.size()); + const int kEstimatedFeasignNumPerSlot = 5; // Magic Number + for (size_t i = 0; i < all_slot_num; i++) { + batch_float_feasigns_.push_back(std::vector()); + batch_uint64_feasigns_.push_back(std::vector()); + batch_float_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + batch_uint64_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + offset_.push_back(std::vector()); + offset_[i].reserve(default_batch_size_ + + 1); // Each lod info will prepend a zero + } + visit_.resize(all_slot_num, false); + pipe_command_ = data_feed_desc.pipe_command(); + finish_init_ = true; + input_type_ = data_feed_desc.input_type(); + size_t pos = pipe_command_.find(".so"); + if (pos != std::string::npos) { + pos = pipe_command_.rfind('|'); + if (pos == std::string::npos) { + so_parser_name_ = pipe_command_; + pipe_command_.clear(); + } else { + so_parser_name_ = pipe_command_.substr(pos + 1); + pipe_command_ = pipe_command_.substr(0, pos); + } + so_parser_name_ = paddle::string::erase_spaces(so_parser_name_); + } else { + so_parser_name_.clear(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemory() { + VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; + if (!so_parser_name_.empty()) { + LoadIntoMemoryByLib(); + } else { + LoadIntoMemoryByCommand(); + } +} +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLib(void) { + if (true) { + // user defined file format analysis + LoadIntoMemoryByFile(); + } else { + LoadIntoMemoryByLine(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByFile(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + CHECK(parser != nullptr); + // get slotrecord object + auto pull_record_func = [this](std::vector& record_vec, + int max_fetch_num, int offset) { + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec[0], offset); + } else { // free all + max_fetch_num = static_cast(record_vec.size()); + if (max_fetch_num > offset) { + SlotRecordPool().put(&record_vec[offset], (max_fetch_num - 
offset)); + } + } + } else if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec, max_fetch_num); + } else { + SlotRecordPool().put(&record_vec); + } + }; + + std::string filename; + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + platform::Timer timeline; + timeline.Start(); + + int lines = 0; + bool is_ok = true; + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + is_ok = parser->ParseFileInstance( + [this](char* buf, int len) { + return fread(buf, sizeof(char), len, this->fp_.get()); + }, + pull_record_func, lines); + + if (!is_ok) { + LOG(WARNING) << "parser error, filename=" << filename + << ", lines=" << lines; + } + } while (!is_ok); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all file, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ << ", lines=" << lines; + } +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLine(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + BufferedLineFileReader::LineFunc line_func = nullptr; + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + int offset = 0; + int old_offset = 0; + + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + // get slotrecord object function + auto record_func = [this, &offset, &record_vec, &old_offset]( + std::vector& vec, int num) { + vec.resize(num); + if (offset + num > OBJPOOL_BLOCK_SIZE) { + input_channel_->WriteMove(offset, &record_vec[0]); + SlotRecordPool().get(&record_vec[0], offset); + record_vec.resize(OBJPOOL_BLOCK_SIZE); + offset = 0; + old_offset = 0; + } + for (int i = 0; i < num; ++i) { + auto& ins = record_vec[offset + i]; + ins->reset(); + vec[i] = ins; + } + offset = offset + num; + }; + + line_func = [this, &parser, &record_vec, &offset, &filename, &record_func, + &old_offset](const std::string& line) { + old_offset = offset; + if (!parser->ParseOneInstance(line, record_func)) { + offset = old_offset; + LOG(WARNING) << "read file:[" << filename << "] item error, line:[" + << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }; + + int lines = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + lines = line_reader.read_file(this->fp_.get(), line_func, lines); + } while (line_reader.is_error()); + + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all lines, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ 
<< ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", filesize=" << line_reader.file_size() / 1024.0 / 1024.0 + << "MB"; + } + + VLOG(3) << "LoadIntoMemoryByLib() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByCommand(void) { +#ifdef _LINUX + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + int lines = 0; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + int offset = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + + lines = line_reader.read_file( + this->fp_.get(), + [this, &record_vec, &offset, &filename](const std::string& line) { + if (ParseOneInstance(line, &record_vec[offset])) { + ++offset; + } else { + LOG(WARNING) << "read file:[" << filename + << "] item error, line:[" << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }, + lines); + } while (line_reader.is_error()); + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemory() read all lines, file=" << filename + << ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_; + } + VLOG(3) << "LoadIntoMemory() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +static void parser_log_key(const std::string& log_key, uint64_t* search_id, + uint32_t* cmatch, uint32_t* rank) { + std::string searchid_str = log_key.substr(16, 16); + *search_id = static_cast(strtoull(searchid_str.c_str(), NULL, 16)); + std::string cmatch_str = log_key.substr(11, 3); + *cmatch = static_cast(strtoul(cmatch_str.c_str(), NULL, 16)); + std::string rank_str = log_key.substr(14, 2); + *rank = static_cast(strtoul(rank_str.c_str(), NULL, 16)); +} + +bool SlotRecordInMemoryDataFeed::ParseOneInstance(const std::string& line, + SlotRecord* ins) { + SlotRecord& rec = (*ins); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + + thread_local std::vector> slot_float_feasigns; + thread_local std::vector> slot_uint64_feasigns; + slot_float_feasigns.resize(float_use_slot_size_); + slot_uint64_feasigns.resize(uint64_use_slot_size_); + + if (parse_ins_id_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + rec->ins_id_ = std::string(str + pos, len); + pos += len + 1; + } + if (parse_logkey_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + // parse_logkey + 
std::string log_key = std::string(str + pos, len); + uint64_t search_id; + uint32_t cmatch; + uint32_t rank; + parser_log_key(log_key, &search_id, &cmatch, &rank); + + rec->ins_id_ = log_key; + rec->search_id = search_id; + rec->cmatch = cmatch; + rec->rank = rank; + pos += len + 1; + } + + int float_total_slot_num = 0; + int uint64_total_slot_num = 0; + + for (size_t i = 0; i < all_slots_info_.size(); ++i) { + auto& info = all_slots_info_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE(num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (info.used_idx != -1) { + if (info.type[0] == 'f') { // float + auto& slot_fea = slot_float_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + if (fabs(feasign) < 1e-6 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++float_total_slot_num; + } + } else if (info.type[0] == 'u') { // uint64 + auto& slot_fea = slot_uint64_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + uint64_t feasign = + static_cast(strtoull(endptr, &endptr, 10)); + if (feasign == 0 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++uint64_total_slot_num; + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + rec->slot_float_feasigns_.add_slot_feasigns(slot_float_feasigns, + float_total_slot_num); + rec->slot_uint64_feasigns_.add_slot_feasigns(slot_uint64_feasigns, + uint64_total_slot_num); + + return (uint64_total_slot_num > 0); +} + +void SlotRecordInMemoryDataFeed::PutToFeedVec(const SlotRecord* ins_vec, + int num) { + for (int j = 0; j < use_slot_size_; ++j) { + auto& feed = feed_vec_[j]; + if (feed == nullptr) { + continue; + } + + auto& slot_offset = offset_[j]; + slot_offset.clear(); + slot_offset.reserve(num + 1); + slot_offset.push_back(0); + + int total_instance = 0; + auto& info = used_slots_info_[j]; + // fill slot value with default value 0 + if (info.type[0] == 'f') { // float + auto& batch_fea = batch_float_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + float* slot_values = + r->slot_float_feasigns_.get_values(info.slot_value_idx, &fea_num); + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(float) * fea_num); + total_instance += fea_num; + slot_offset.push_back(total_instance); + } + + float* feasign = batch_fea.data(); + float* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float)); + + } else if (info.type[0] == 'u') { // uint64 + auto& batch_fea = batch_uint64_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + uint64_t* slot_values = + r->slot_uint64_feasigns_.get_values(info.slot_value_idx, &fea_num); + if (fea_num > 0) { + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(uint64_t) * fea_num); + total_instance += fea_num; + } + if (fea_num == 0) { + batch_fea.resize(total_instance + fea_num); + 
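+          // This instance has no value for the slot: pad with a single
+          // default 0 feasign so the lod offset still advances by one.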
batch_fea[total_instance] = 0; + total_instance += 1; + } + slot_offset.push_back(total_instance); + } + + // no uint64_t type in paddlepaddle + uint64_t* feasign = batch_fea.data(); + int64_t* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t)); + } + + if (info.dense) { + if (info.inductive_shape_index != -1) { + info.local_shape[info.inductive_shape_index] = + total_instance / info.total_dims_without_inductive; + } + feed->Resize(framework::make_ddim(info.local_shape)); + } else { + LoD data_lod{slot_offset}; + feed_vec_[j]->set_lod(data_lod); + } + } +} + +void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) { + SlotRecord& ins = (*rec); + if (ins->slot_float_feasigns_.slot_offsets.empty()) { + return; + } + size_t total_value_size = ins->slot_float_feasigns_.slot_values.size(); + if (float_total_dims_size_ == total_value_size) { + return; + } + int float_slot_num = + static_cast(float_total_dims_without_inductives_.size()); + CHECK(float_slot_num == float_use_slot_size_); + std::vector old_values; + std::vector old_offsets; + old_values.swap(ins->slot_float_feasigns_.slot_values); + old_offsets.swap(ins->slot_float_feasigns_.slot_offsets); + + ins->slot_float_feasigns_.slot_values.resize(float_total_dims_size_); + ins->slot_float_feasigns_.slot_offsets.assign(float_slot_num + 1, 0); + + auto& slot_offsets = ins->slot_float_feasigns_.slot_offsets; + auto& slot_values = ins->slot_float_feasigns_.slot_values; + + uint32_t offset = 0; + int num = 0; + uint32_t old_off = 0; + int dim = 0; + + for (int i = 0; i < float_slot_num; ++i) { + dim = float_total_dims_without_inductives_[i]; + old_off = old_offsets[i]; + num = static_cast(old_offsets[i + 1] - old_off); + if (num == 0) { + // fill slot value with default value 0 + for (int k = 0; k < dim; ++k) { + slot_values[k + offset] = 0.0; + } + } else { + if (num == dim) { + memcpy(&slot_values[offset], &old_values[old_off], dim * sizeof(float)); + } else { + // position fea + // record position index need fix values + int pos_idx = static_cast(old_values[old_off]); + for (int k = 0; k < dim; ++k) { + if (k == pos_idx) { + slot_values[k + offset] = 1.0; + } else { + slot_values[k + offset] = 0.0; + } + } + } + } + slot_offsets[i] = offset; + offset += dim; + } + slot_offsets[float_slot_num] = offset; + CHECK(float_total_dims_size_ == static_cast(offset)); +} + +bool SlotRecordInMemoryDataFeed::Start() { +#ifdef _LINUX + this->CheckSetFileList(); + if (input_channel_->Size() != 0) { + std::vector data; + input_channel_->Read(data); + } +#endif + if (batch_offsets_.size() > 0) { + VLOG(3) << "batch_size offsets: " << batch_offsets_.size(); + enable_heterps_ = true; + this->offset_index_ = 0; + } + this->finish_start_ = true; + return true; +} + +int SlotRecordInMemoryDataFeed::Next() { +#ifdef _LINUX + this->CheckStart(); + + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + if (offset_index_ >= batch_offsets_.size()) { + VLOG(3) << "offset_index: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + return 0; + } + auto& batch = batch_offsets_[offset_index_++]; + this->batch_size_ = batch.second; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(&records_[batch.first], this->batch_size_); + } else { + VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" + << thread_id_; + } + 
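PutToFeedVec() above flattens each instance's variable-length feasigns into one contiguous buffer and records cumulative lengths in slot_offset, which then becomes the tensor's LoD. A standalone sketch of that layout under simplified assumptions (a single uint64 slot, empty slots padded with one 0 as the feed does); FlatBatch and FlattenSlot are illustrative names, not the Paddle API:

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // CSR-style batch layout: all values concatenated, plus an offsets array of
    // length num_instances + 1 so offsets[i+1] - offsets[i] is instance i's count.
    struct FlatBatch {
      std::vector<uint64_t> values;
      std::vector<size_t> offsets;
    };

    FlatBatch FlattenSlot(const std::vector<std::vector<uint64_t>>& per_ins) {
      FlatBatch out;
      out.offsets.push_back(0);
      for (const auto& feas : per_ins) {
        if (feas.empty()) {
          out.values.push_back(0);  // pad an empty slot with a single 0
        } else {
          out.values.insert(out.values.end(), feas.begin(), feas.end());
        }
        out.offsets.push_back(out.values.size());
      }
      return out;
    }

    int main() {
      FlatBatch b = FlattenSlot({{101, 102}, {}, {103}});
      for (size_t i = 0; i + 1 < b.offsets.size(); ++i) {
        std::printf("instance %zu has %zu value(s)\n", i,
                    b.offsets[i + 1] - b.offsets[i]);
      }
      return 0;
    }

With this layout, instance i's features are values[offsets[i] .. offsets[i+1]), which is exactly what the LoD level set on the feed tensor encodes.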
VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size() + << " baych_size: " << this->batch_size_; + + return this->batch_size_; +#else + return 0; +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 198bc51463af3..a4100e66e7285 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -39,8 +39,14 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" +DECLARE_int32(record_pool_max_size); +DECLARE_int32(slotpool_thread_num); +DECLARE_bool(enable_slotpool_wait_release); +DECLARE_bool(enable_slotrecord_reset_shrink); + namespace paddle { namespace framework { class DataFeedDesc; @@ -69,6 +75,50 @@ namespace framework { // while (reader->Next()) { // // trainer do something // } + +template +struct SlotValues { + std::vector slot_values; + std::vector slot_offsets; + + void add_values(const T* values, uint32_t num) { + if (slot_offsets.empty()) { + slot_offsets.push_back(0); + } + if (num > 0) { + slot_values.insert(slot_values.end(), values, values + num); + } + slot_offsets.push_back(static_cast(slot_values.size())); + } + T* get_values(int idx, size_t* size) { + uint32_t& offset = slot_offsets[idx]; + (*size) = slot_offsets[idx + 1] - offset; + return &slot_values[offset]; + } + void add_slot_feasigns(const std::vector>& slot_feasigns, + uint32_t fea_num) { + slot_values.reserve(fea_num); + int slot_num = static_cast(slot_feasigns.size()); + slot_offsets.resize(slot_num + 1); + for (int i = 0; i < slot_num; ++i) { + auto& slot_val = slot_feasigns[i]; + slot_offsets[i] = static_cast(slot_values.size()); + uint32_t num = static_cast(slot_val.size()); + if (num > 0) { + slot_values.insert(slot_values.end(), slot_val.begin(), slot_val.end()); + } + } + slot_offsets[slot_num] = slot_values.size(); + } + void clear(bool shrink) { + slot_offsets.clear(); + slot_values.clear(); + if (shrink) { + slot_values.shrink_to_fit(); + slot_offsets.shrink_to_fit(); + } + } +}; union FeatureFeasign { uint64_t uint64_feasign_; float float_feasign_; @@ -97,6 +147,38 @@ struct FeatureItem { uint16_t slot_; }; +struct AllSlotInfo { + std::string slot; + std::string type; + int used_idx; + int slot_value_idx; +}; +struct UsedSlotInfo { + int idx; + int slot_value_idx; + std::string slot; + std::string type; + bool dense; + std::vector local_shape; + int total_dims_without_inductive; + int inductive_shape_index; +}; +struct SlotRecordObject { + uint64_t search_id; + uint32_t rank; + uint32_t cmatch; + std::string ins_id_; + SlotValues slot_uint64_feasigns_; + SlotValues slot_float_feasigns_; + + ~SlotRecordObject() { clear(true); } + void reset(void) { clear(FLAGS_enable_slotrecord_reset_shrink); } + void clear(bool shrink) { + slot_uint64_feasigns_.clear(shrink); + slot_float_feasigns_.clear(shrink); + } +}; +using SlotRecord = SlotRecordObject*; // sizeof Record is much less than std::vector struct Record { std::vector uint64_feasigns_; @@ -108,6 +190,179 @@ struct Record { uint32_t cmatch; }; +inline SlotRecord make_slotrecord() { + static const size_t slot_record_byte_size = sizeof(SlotRecordObject); + void* p = malloc(slot_record_byte_size); + new (p) SlotRecordObject; + return reinterpret_cast(p); +} + +inline void 
free_slotrecord(SlotRecordObject* p) { + p->~SlotRecordObject(); + free(p); +} + +template +class SlotObjAllocator { + public: + explicit SlotObjAllocator(std::function deleter) + : free_nodes_(NULL), capacity_(0), deleter_(deleter) {} + ~SlotObjAllocator() { clear(); } + + void clear() { + T* tmp = NULL; + while (free_nodes_ != NULL) { + tmp = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + deleter_(tmp); + --capacity_; + } + CHECK_EQ(capacity_, static_cast(0)); + } + T* acquire(void) { + T* x = NULL; + x = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + --capacity_; + return x; + } + void release(T* x) { + Node* node = reinterpret_cast(reinterpret_cast(x)); + node->next = free_nodes_; + free_nodes_ = node; + ++capacity_; + } + size_t capacity(void) { return capacity_; } + + private: + struct alignas(T) Node { + union { + Node* next; + char data[sizeof(T)]; + }; + }; + Node* free_nodes_; // a list + size_t capacity_; + std::function deleter_ = nullptr; +}; +static const int OBJPOOL_BLOCK_SIZE = 10000; +class SlotObjPool { + public: + SlotObjPool() + : max_capacity_(FLAGS_record_pool_max_size), alloc_(free_slotrecord) { + ins_chan_ = MakeChannel(); + ins_chan_->SetBlockSize(OBJPOOL_BLOCK_SIZE); + for (int i = 0; i < FLAGS_slotpool_thread_num; ++i) { + threads_.push_back(std::thread([this]() { run(); })); + } + disable_pool_ = false; + count_ = 0; + } + ~SlotObjPool() { + ins_chan_->Close(); + for (auto& t : threads_) { + t.join(); + } + } + void disable_pool(bool disable) { disable_pool_ = disable; } + void set_max_capacity(size_t max_capacity) { max_capacity_ = max_capacity; } + void get(std::vector* output, int n) { + output->resize(n); + return get(&(*output)[0], n); + } + void get(SlotRecord* output, int n) { + int size = 0; + mutex_.lock(); + int left = static_cast(alloc_.capacity()); + if (left > 0) { + size = (left >= n) ? 
n : left; + for (int i = 0; i < size; ++i) { + output[i] = alloc_.acquire(); + } + } + mutex_.unlock(); + count_ += n; + if (size == n) { + return; + } + for (int i = size; i < n; ++i) { + output[i] = make_slotrecord(); + } + } + void put(std::vector* input) { + size_t size = input->size(); + if (size == 0) { + return; + } + put(&(*input)[0], size); + input->clear(); + } + void put(SlotRecord* input, size_t size) { + CHECK(ins_chan_->WriteMove(size, input) == size); + } + void run(void) { + std::vector input; + while (ins_chan_->ReadOnce(input, OBJPOOL_BLOCK_SIZE)) { + if (input.empty()) { + continue; + } + // over max capacity + size_t n = input.size(); + count_ -= n; + if (disable_pool_ || n + capacity() > max_capacity_) { + for (auto& t : input) { + free_slotrecord(t); + } + } else { + for (auto& t : input) { + t->reset(); + } + mutex_.lock(); + for (auto& t : input) { + alloc_.release(t); + } + mutex_.unlock(); + } + input.clear(); + } + } + void clear(void) { + platform::Timer timeline; + timeline.Start(); + mutex_.lock(); + alloc_.clear(); + mutex_.unlock(); + // wait release channel data + if (FLAGS_enable_slotpool_wait_release) { + while (!ins_chan_->Empty()) { + sleep(1); + } + } + timeline.Pause(); + VLOG(3) << "clear slot pool data size=" << count_.load() + << ", span=" << timeline.ElapsedSec(); + } + size_t capacity(void) { + mutex_.lock(); + size_t total = alloc_.capacity(); + mutex_.unlock(); + return total; + } + + private: + size_t max_capacity_; + Channel ins_chan_; + std::vector threads_; + std::mutex mutex_; + SlotObjAllocator alloc_; + bool disable_pool_; + std::atomic count_; // NOLINT +}; + +inline SlotObjPool& SlotRecordPool() { + static SlotObjPool pool; + return pool; +} struct PvInstanceObject { std::vector ads; void merge_instance(Record* ins) { ads.push_back(ins); } @@ -129,7 +384,21 @@ class CustomParser { CustomParser() {} virtual ~CustomParser() {} virtual void Init(const std::vector& slots) = 0; + virtual bool Init(const std::vector& slots); virtual void ParseOneInstance(const char* str, Record* instance) = 0; + virtual bool ParseOneInstance( + const std::string& line, + std::function&, int)> + GetInsFunc) { // NOLINT + return true; + } + virtual bool ParseFileInstance( + std::function ReadBuffFunc, + std::function&, int, int)> + PullRecordsFunc, // NOLINT + int& lines) { // NOLINT + return false; + } }; typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)(); @@ -194,6 +463,34 @@ class DLManager { return nullptr; } + paddle::framework::CustomParser* Load(const std::string& name, + const std::vector& conf) { +#ifdef _LINUX + std::lock_guard lock(mutex_); + DLHandle handle; + std::map::iterator it = handle_map_.find(name); + if (it != handle_map_.end()) { + return it->second.parser; + } + handle.module = dlopen(name.c_str(), RTLD_NOW); + if (handle.module == nullptr) { + VLOG(0) << "Create so of " << name << " fail"; + exit(-1); + return nullptr; + } + + CreateParserObjectFunc create_parser_func = + (CreateParserObjectFunc)dlsym(handle.module, "CreateParserObject"); + handle.parser = create_parser_func(); + handle.parser->Init(conf); + handle_map_.insert({name, handle}); + + return handle.parser; +#endif + VLOG(0) << "Not implement in windows"; + return nullptr; + } + paddle::framework::CustomParser* ReLoad(const std::string& name, const std::vector& conf) { Close(name); @@ -415,6 +712,11 @@ class InMemoryDataFeed : public DataFeed { virtual void SetCurrentPhase(int current_phase); virtual void LoadIntoMemory(); virtual void 
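make_slotrecord()/free_slotrecord() pair malloc with placement new and an explicit destructor call, and SlotObjAllocator threads freed storage into an intrusive free list so recycling needs no bookkeeping allocations. A compressed sketch of that idiom with an illustrative Obj type (not the Paddle classes, and without the pool's channel, locking, or capacity cap):

    #include <cassert>
    #include <cstdlib>
    #include <new>

    struct Obj {
      int payload = 0;
    };

    struct FreeList {
      // A freed object's storage is reused as a list node.
      struct alignas(Obj) Node {
        union {
          Node* next;
          char data[sizeof(Obj)];
        };
      };
      Node* head = nullptr;

      Obj* acquire() {
        void* storage;
        if (head != nullptr) {
          storage = head;
          head = head->next;                    // pop a recycled slot
        } else {
          storage = std::malloc(sizeof(Node));  // big enough for Obj or Node
        }
        return new (storage) Obj;               // construct in raw storage
      }

      void release(Obj* o) {
        o->~Obj();                              // destroy, keep the storage
        Node* n = new (static_cast<void*>(o)) Node;
        n->next = head;
        head = n;
      }

      ~FreeList() {
        while (head != nullptr) {
          Node* n = head;
          head = n->next;
          std::free(n);
        }
      }
    };

    int main() {
      FreeList pool;
      Obj* a = pool.acquire();
      a->payload = 42;
      pool.release(a);
      Obj* b = pool.acquire();   // very likely reuses a's storage
      assert(b->payload == 0);   // freshly constructed, not stale
      pool.release(b);
      return 0;
    }

The patch goes one step further: released SlotRecordObjects are not destroyed but reset() and handed back through a channel, trading a little resident memory for construction cost.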
LoadIntoMemoryFromSo(); + virtual void SetRecord(T* records) { records_ = records; } + int GetDefaultBatchSize() { return default_batch_size_; } + void AddBatchOffset(const std::pair& offset) { + batch_offsets_.push_back(offset); + } protected: virtual bool ParseOneInstance(T* instance) = 0; @@ -424,6 +726,11 @@ class InMemoryDataFeed : public DataFeed { virtual void PutToFeedVec(const std::vector& ins_vec) = 0; virtual void PutToFeedVec(const T* ins_vec, int num) = 0; + std::vector> batch_float_feasigns_; + std::vector> batch_uint64_feasigns_; + std::vector> offset_; + std::vector visit_; + int thread_id_; int thread_num_; bool parse_ins_id_; @@ -783,11 +1090,7 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { MultiSlotInMemoryDataFeed() {} virtual ~MultiSlotInMemoryDataFeed() {} virtual void Init(const DataFeedDesc& data_feed_desc); - void SetRecord(Record* records) { records_ = records; } - int GetDefaultBatchSize() { return default_batch_size_; } - void AddBatchOffset(const std::pair& offset) { - batch_offsets_.push_back(offset); - } + // void SetRecord(Record* records) { records_ = records; } protected: virtual bool ParseOneInstance(Record* instance); @@ -798,10 +1101,42 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id, uint32_t* cmatch, uint32_t* rank); virtual void PutToFeedVec(const Record* ins_vec, int num); - std::vector> batch_float_feasigns_; - std::vector> batch_uint64_feasigns_; - std::vector> offset_; - std::vector visit_; +}; + +class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { + public: + SlotRecordInMemoryDataFeed() {} + virtual ~SlotRecordInMemoryDataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc); + virtual void LoadIntoMemory(); + void ExpandSlotRecord(SlotRecord* ins); + + protected: + virtual bool Start(); + virtual int Next(); + virtual bool ParseOneInstance(SlotRecord* instance) { return false; } + virtual bool ParseOneInstanceFromPipe(SlotRecord* instance) { return false; } + // virtual void ParseOneInstanceFromSo(const char* str, T* instance, + // CustomParser* parser) {} + virtual void PutToFeedVec(const std::vector& ins_vec) {} + + virtual void LoadIntoMemoryByCommand(void); + virtual void LoadIntoMemoryByLib(void); + virtual void LoadIntoMemoryByLine(void); + virtual void LoadIntoMemoryByFile(void); + virtual void SetInputChannel(void* channel) { + input_channel_ = static_cast*>(channel); + } + bool ParseOneInstance(const std::string& line, SlotRecord* rec); + virtual void PutToFeedVec(const SlotRecord* ins_vec, int num); + float sample_rate_ = 1.0f; + int use_slot_size_ = 0; + int float_use_slot_size_ = 0; + int uint64_use_slot_size_ = 0; + std::vector all_slots_info_; + std::vector used_slots_info_; + size_t float_total_dims_size_ = 0; + std::vector float_total_dims_without_inductives_; }; class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index ec1b8ec773fa6..e46e4aeb0124c 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -58,8 +58,8 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( std::string data_feed_class) { if (g_data_feed_map.count(data_feed_class) < 1) { LOG(WARNING) << "Your DataFeed " << data_feed_class - << "is not supported currently"; - LOG(WARNING) << "Supported DataFeed: " << DataFeedTypeList(); + << " is not supported currently"; + 
LOG(WARNING) << " Supported DataFeed: " << DataFeedTypeList(); exit(-1); } return g_data_feed_map[data_feed_class](); @@ -68,6 +68,7 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); +REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 08c42a93d1fcb..2a071665b263c 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -351,10 +351,8 @@ static int compute_thread_batch_nccl( return thread_avg_batch_num; } -template -void DatasetImpl::SetHeterPs(bool enable_heterps) { +void MultiSlotDataset::PrepareTrain() { #ifdef PADDLE_WITH_GLOO - enable_heterps_ = enable_heterps; if (enable_heterps_) { if (input_records_.size() == 0 && input_channel_ != nullptr && input_channel_->Size() != 0) { @@ -541,22 +539,21 @@ void DatasetImpl::LocalShuffle() { << timeline.ElapsedSec() << " seconds"; } -template -void DatasetImpl::GlobalShuffle(int thread_num) { +void MultiSlotDataset::GlobalShuffle(int thread_num) { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "DatasetImpl::GlobalShuffle() begin"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() begin"; platform::Timer timeline; timeline.Start(); auto fleet_ptr = FleetWrapper::GetInstance(); if (!input_channel_ || input_channel_->Size() == 0) { - VLOG(3) << "DatasetImpl::GlobalShuffle() end, no data to shuffle"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() end, no data to shuffle"; return; } // local shuffle input_channel_->Close(); - std::vector data; + std::vector data; input_channel_->ReadAll(data); std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine()); input_channel_->Open(); @@ -566,10 +563,10 @@ void DatasetImpl::GlobalShuffle(int thread_num) { input_channel_->Close(); input_channel_->SetBlockSize(fleet_send_batch_size_); - VLOG(3) << "DatasetImpl::GlobalShuffle() input_channel_ size " + VLOG(3) << "MultiSlotDataset::GlobalShuffle() input_channel_ size " << input_channel_->Size(); - auto get_client_id = [this, fleet_ptr](const T& data) -> size_t { + auto get_client_id = [this, fleet_ptr](const Record& data) -> size_t { if (!this->merge_by_insid_) { return fleet_ptr->LocalRandomEngine()() % this->trainer_num_; } else { @@ -580,7 +577,7 @@ void DatasetImpl::GlobalShuffle(int thread_num) { auto global_shuffle_func = [this, get_client_id]() { auto fleet_ptr = FleetWrapper::GetInstance(); - std::vector data; + std::vector data; while (this->input_channel_->Read(data)) { std::vector ars(this->trainer_num_); for (auto& t : data) { @@ -835,9 +832,6 @@ void DatasetImpl::CreateReaders() { channel_idx = 0; } } - if (enable_heterps_) { - SetHeterPs(true); - } VLOG(3) << "readers size: " << readers_.size(); } @@ -923,9 +917,8 @@ int64_t DatasetImpl::GetShuffleDataSize() { return sum; } -template -int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, - const std::string& msg) { +int MultiSlotDataset::ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) { #ifdef _LINUX VLOG(3) << "ReceiveFromClient msg_type=" << msg_type << ", client_id=" << client_id << ", msg length=" << msg.length(); @@ -937,9 +930,9 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, if (ar.Cursor() == ar.Finish()) { return 0; } - std::vector 
data; + std::vector data; while (ar.Cursor() < ar.Finish()) { - data.push_back(ar.Get()); + data.push_back(ar.Get()); } CHECK(ar.Cursor() == ar.Finish()); @@ -966,6 +959,20 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, // explicit instantiation template class DatasetImpl; +void MultiSlotDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + void MultiSlotDataset::PostprocessInstance() { // divide pv instance, and merge to input_channel_ if (enable_pv_merge_) { @@ -1503,5 +1510,154 @@ void MultiSlotDataset::SlotsShuffle( << ", cost time=" << timeline.ElapsedSec() << " seconds"; } +template class DatasetImpl; +void SlotRecordDataset::CreateChannel() { + if (input_channel_ == nullptr) { + input_channel_ = paddle::framework::MakeChannel(); + } +} +void SlotRecordDataset::CreateReaders() { + VLOG(3) << "Calling CreateReaders()"; + VLOG(3) << "thread num in Dataset: " << thread_num_; + VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); + VLOG(3) << "channel num in Dataset: " << channel_num_; + CHECK(thread_num_ > 0) << "thread num should > 0"; + CHECK(channel_num_ > 0) << "channel num should > 0"; + CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; + VLOG(3) << "readers size: " << readers_.size(); + if (readers_.size() != 0) { + VLOG(3) << "readers_.size() = " << readers_.size() + << ", will not create again"; + return; + } + VLOG(3) << "data feed class name: " << data_feed_desc_.name(); + for (int i = 0; i < thread_num_; ++i) { + readers_.push_back(DataFeedFactory::CreateDataFeed(data_feed_desc_.name())); + readers_[i]->Init(data_feed_desc_); + readers_[i]->SetThreadId(i); + readers_[i]->SetThreadNum(thread_num_); + readers_[i]->SetFileListMutex(&mutex_for_pick_file_); + readers_[i]->SetFileListIndex(&file_idx_); + readers_[i]->SetFeaNumMutex(&mutex_for_fea_num_); + readers_[i]->SetFeaNum(&total_fea_num_); + readers_[i]->SetFileList(filelist_); + readers_[i]->SetParseInsId(parse_ins_id_); + readers_[i]->SetParseContent(parse_content_); + readers_[i]->SetParseLogKey(parse_logkey_); + readers_[i]->SetEnablePvMerge(enable_pv_merge_); + readers_[i]->SetCurrentPhase(current_phase_); + if (input_channel_ != nullptr) { + readers_[i]->SetInputChannel(input_channel_.get()); + } + } + VLOG(3) << "readers size: " << readers_.size(); +} + +void SlotRecordDataset::ReleaseMemory() { + VLOG(3) << "SlotRecordDataset::ReleaseMemory() begin"; + platform::Timer timeline; + timeline.Start(); + + if (input_channel_) { + input_channel_->Clear(); + input_channel_ = nullptr; + } + if (enable_heterps_) { + VLOG(3) << "put pool records size: " << input_records_.size(); + SlotRecordPool().put(&input_records_); + input_records_.clear(); + input_records_.shrink_to_fit(); + VLOG(3) << "release heterps input records records size: " + << input_records_.size(); + } + + readers_.clear(); + readers_.shrink_to_fit(); + + std::vector>().swap(readers_); + + VLOG(3) << "SlotRecordDataset::ReleaseMemory() end"; + VLOG(3) << "total_feasign_num_(" << STAT_GET(STAT_total_feasign_num_in_mem) + << ") - current_fea_num_(" << total_fea_num_ << ") = (" + << 
STAT_GET(STAT_total_feasign_num_in_mem) - total_fea_num_ << ")" + << " object pool size=" << SlotRecordPool().capacity(); // For Debug + STAT_SUB(STAT_total_feasign_num_in_mem, total_fea_num_); +} +void SlotRecordDataset::GlobalShuffle(int thread_num) { + // TODO(yaoxuefeng) + return; +} + +void SlotRecordDataset::DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins) { + if (channel_num_ == channel_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustChannelNum channel_num_=" + << channel_num_ << ", channel_num_=channel_num, no need to adjust"; + return; + } + VLOG(3) << "adjust channel num from " << channel_num_ << " to " + << channel_num; + channel_num_ = channel_num; + + if (static_cast(input_channel_->Size()) >= channel_num) { + input_channel_->SetBlockSize(input_channel_->Size() / channel_num + + (discard_remaining_ins ? 0 : 1)); + } + + VLOG(3) << "adjust channel num done"; +} + +void SlotRecordDataset::PrepareTrain() { +#ifdef PADDLE_WITH_GLOO + if (enable_heterps_) { + if (input_records_.size() == 0 && input_channel_ != nullptr && + input_channel_->Size() != 0) { + input_channel_->ReadAll(input_records_); + VLOG(3) << "read from channel to records with records size: " + << input_records_.size(); + } + VLOG(3) << "input records size: " << input_records_.size(); + int64_t total_ins_num = input_records_.size(); + std::vector> offset; + int default_batch_size = + reinterpret_cast(readers_[0].get()) + ->GetDefaultBatchSize(); + VLOG(3) << "thread_num: " << thread_num_ + << " memory size: " << total_ins_num + << " default batch_size: " << default_batch_size; + compute_thread_batch_nccl(thread_num_, total_ins_num, default_batch_size, + &offset); + VLOG(3) << "offset size: " << offset.size(); + for (int i = 0; i < thread_num_; i++) { + reinterpret_cast(readers_[i].get()) + ->SetRecord(&input_records_[0]); + } + for (size_t i = 0; i < offset.size(); i++) { + reinterpret_cast( + readers_[i % thread_num_].get()) + ->AddBatchOffset(offset[i]); + } + } +#else + PADDLE_THROW(platform::errors::Unavailable( + "dataset set heterps need compile with GLOO")); +#endif + return; +} + +void SlotRecordDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index f3ee96fab8297..981fb694e0fec 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -149,7 +149,6 @@ class Dataset { virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 0; - virtual void SetHeterPs(bool enable_heterps) = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -207,7 +206,7 @@ class DatasetImpl : public Dataset { virtual void WaitPreLoadDone(); virtual void ReleaseMemory(); virtual void LocalShuffle(); - virtual void GlobalShuffle(int thread_num = -1); + virtual void GlobalShuffle(int thread_num = -1) {} virtual void SlotsShuffle(const std::set& slots_to_replace) {} virtual const std::vector& GetSlotsOriginalData() { return 
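PrepareTrain() above splits the in-memory record array into (start index, batch size) pairs via compute_thread_batch_nccl() and hands them out round-robin with AddBatchOffset(); Next() then serves one pair per call. A simplified sketch of that hand-off (the splitting policy below is naive and not the NCCL-aware logic in the patch; SplitIntoBatches is an illustrative name):

    #include <cstdio>
    #include <utility>
    #include <vector>

    std::vector<std::pair<int, int>> SplitIntoBatches(int total, int batch_size) {
      std::vector<std::pair<int, int>> offsets;
      for (int start = 0; start < total; start += batch_size) {
        int len = (start + batch_size <= total) ? batch_size : (total - start);
        offsets.emplace_back(start, len);
      }
      return offsets;
    }

    int main() {
      const size_t kThreads = 3;
      auto offsets = SplitIntoBatches(/*total=*/10, /*batch_size=*/4);
      // Round-robin the batches over readers, mirroring
      // readers_[i % thread_num_]->AddBatchOffset(offset[i]).
      for (size_t i = 0; i < offsets.size(); ++i) {
        std::printf("reader %zu gets batch [start=%d, size=%d]\n",
                    i % kThreads, offsets[i].first, offsets[i].second);
      }
      return 0;
    }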
slots_shuffle_original_data_; @@ -233,7 +232,11 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); - virtual void SetHeterPs(bool enable_heterps); + /* for enable_heterps_ + virtual void EnableHeterps(bool enable_heterps) { + enable_heterps_ = enable_heterps; + } + */ std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -251,7 +254,10 @@ class DatasetImpl : public Dataset { protected: virtual int ReceiveFromClient(int msg_type, int client_id, - const std::string& msg); + const std::string& msg) { + // TODO(yaoxuefeng) for SlotRecordDataset + return -1; + } std::vector> readers_; std::vector> preload_readers_; paddle::framework::Channel input_channel_; @@ -327,6 +333,32 @@ class MultiSlotDataset : public DatasetImpl { const std::unordered_set& slots_to_replace, std::vector* result); virtual ~MultiSlotDataset() {} + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustReadersNum(int thread_num); + virtual void PrepareTrain(); + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg); +}; +class SlotRecordDataset : public DatasetImpl { + public: + SlotRecordDataset() { SlotRecordPool(); } + virtual ~SlotRecordDataset() {} + // create input channel + virtual void CreateChannel(); + // create readers + virtual void CreateReaders(); + // release memory + virtual void ReleaseMemory(); + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins); + virtual void PrepareTrain(); + virtual void DynamicAdjustReadersNum(int thread_num); + + protected: + bool enable_heterps_ = true; }; } // end namespace framework diff --git a/paddle/fluid/framework/dataset_factory.cc b/paddle/fluid/framework/dataset_factory.cc index aeaf961185323..38200927c5586 100644 --- a/paddle/fluid/framework/dataset_factory.cc +++ b/paddle/fluid/framework/dataset_factory.cc @@ -53,7 +53,7 @@ std::unique_ptr DatasetFactory::CreateDataset( std::string dataset_class) { if (g_dataset_map.count(dataset_class) < 1) { LOG(WARNING) << "Your Dataset " << dataset_class - << "is not supported currently"; + << " is not supported currently"; LOG(WARNING) << "Supported Dataset: " << DatasetTypeList(); exit(-1); } @@ -61,5 +61,6 @@ std::unique_ptr DatasetFactory::CreateDataset( } REGISTER_DATASET_CLASS(MultiSlotDataset); +REGISTER_DATASET_CLASS(SlotRecordDataset); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 72f7f0e6011c1..ad81b48847af9 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -139,7 +139,7 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass coalesce_grad_tensor_pass fuse_all_reduce_op_pass backward_optimizer_op_deps_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass - fix_op_run_order_pass) + paddle_to_cinn_pass fix_op_run_order_pass) if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 0d55882953db3..a55b809055f3e 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ 
b/paddle/fluid/framework/details/build_strategy.cc @@ -19,8 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" -DECLARE_bool(use_mkldnn); DECLARE_bool(convert_all_blocks); +DECLARE_bool(use_cinn); +DECLARE_bool(use_mkldnn); namespace paddle { namespace framework { @@ -71,6 +72,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to check whether the multi_device_graph is right. AppendPass("multi_devices_check_pass"); + // Note: This pass is used to enable cinn. + if (FLAGS_use_cinn) { + AppendPass("paddle_to_cinn_pass"); + } SetCollectiveContext(); } diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 0629f1b91504a..25110fe24f587 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -143,6 +143,8 @@ struct BuildStrategy { // Turn off inplace addto by default. bool enable_addto_{false}; + bool allow_cuda_graph_capture_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2256b826ed501..60b8461668f6f 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -16,6 +16,8 @@ #include +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -31,11 +33,13 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { - WaitInputVarGenerated(place_); + if (!FLAGS_allreduce_record_one_event) { + WaitInputVarGenerated(place_); + } auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); }; - if (is_lock_and_record_event_free_) { + if (is_lock_and_record_event_free_ || FLAGS_allreduce_record_one_event) { run_func(); } else { this->RunAndRecordEvent(run_func); diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 120bdd2bc9f56..75998e4582e2b 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -130,10 +130,12 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( } } // Wait FetchOps. 
- ClearFetchOp(graph_, &fetch_ops); + if (!fetch_ops.empty()) { + ClearFetchOp(graph_, &fetch_ops); - for (auto &place : places_) { - fetch_ctxs_.Get(place)->Wait(); + for (auto &place : places_) { + fetch_ctxs_.Get(place)->Wait(); + } } return fetches; diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 8f45c364476a7..94507140a81d6 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -48,11 +50,80 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( num_of_all_reduce_(num_of_all_reduce) {} #endif +FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto destroy_event = [](gpuEvent_t event) { + if (event == nullptr) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); +#endif + }; + destroy_event(start_event_); + destroy_event(end_event_); +#endif +} + void FusedAllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); VLOG(4) << this->DebugString(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event && start_event_ == nullptr) { + VLOG(10) << "FLAGS_allreduce_record_one_event=true"; + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "The hierarchical allreduce does not support " + "FLAGS_allreduce_record_one_event=true")); + PADDLE_ENFORCE_EQ(places_.size(), 1, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using one GPU device per process.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(places_[0]), true, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using GPU device.")); + auto create_event = [](gpuEvent_t *event) { + if (*event) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipEventCreateWithFlags(event, hipEventDisableTiming)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(event, cudaEventDisableTiming)); +#endif + }; + create_event(&start_event_); + create_event(&end_event_); + } + + gpuStream_t nccl_stream{nullptr}; + gpuStream_t compute_stream{nullptr}; + + if (FLAGS_allreduce_record_one_event) { + auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]); + compute_stream = + platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream(); + auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); + auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); + nccl_stream = nccl_ctx.stream(); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(nccl_stream, start_event_, 0)); +#endif + } else { + WaitInputVarGenerated(); + } +#else WaitInputVarGenerated(); +#endif + // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... 
auto in_var_handles = DynamicCast(this->Inputs()); @@ -94,6 +165,20 @@ void FusedAllReduceOpHandle::RunImpl() { } else { FusedAllReduceFunc(in_var_handles, out_var_handles); } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamWaitEvent(compute_stream, end_event_, 0)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(compute_stream, end_event_, 0)); +#endif + } +#endif } void FusedAllReduceOpHandle::FusedAllReduceFunc( diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index d22dc0a421ac0..8473700867ce3 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -67,12 +67,19 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { #endif std::string Name() const override; + ~FusedAllReduceOpHandle(); + protected: void RunImpl() override; private: size_t num_of_all_reduce_; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + gpuEvent_t start_event_{nullptr}; + gpuEvent_t end_event_{nullptr}; +#endif + // Check the dtype of the input void GetDTypeAndNumel( const std::vector> &g_tensor, diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index fcfbfd0557e25..1e3cd4f0aa77c 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -86,26 +86,35 @@ struct ScaleLossGradFunctor { } }; +std::string ScaleLossGradOpHandle::LossGradName() const { + return static_cast(this->outputs_[0])->name(); +} + void ScaleLossGradOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); - // Doesn't wait any event - std::string var_name = static_cast(this->outputs_[0])->name(); + RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true); +} - auto *tensor = - local_exec_scopes_[0]->FindVar(var_name)->GetMutable(); +void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { + auto *tensor = var->GetMutable(); tensor->Resize(make_ddim({1})); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); - this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); + if (record_event) { + this->RunAndRecordEvent( + [&] { framework::VisitDataType(out_dtype_, func); }); + } else { + framework::VisitDataType(out_dtype_, func); + } #else ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr); framework::VisitDataType(out_dtype_, func); #endif } -std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } +std::string ScaleLossGradOpHandle::Name() const { return "ScaleLossGrad"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 02e5aa88443df..88fe02a749fe4 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase { std::string Name() const 
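The FLAGS_allreduce_record_one_event path above replaces per-input waits with two events: the communication stream waits on an event recorded on the compute stream before the fused allreduce, and the compute stream waits on an event recorded on the communication stream after it. A bare CUDA runtime sketch of that ordering (stream and event names and the CUDA_CHECK macro are illustrative; no kernels or NCCL calls are enqueued here):

    #include <cuda_runtime.h>
    #include <cstdio>

    #define CUDA_CHECK(expr)                                        \
      do {                                                          \
        cudaError_t err = (expr);                                   \
        if (err != cudaSuccess) {                                   \
          std::printf("CUDA error: %s\n", cudaGetErrorString(err)); \
          return 1;                                                 \
        }                                                           \
      } while (0)

    int main() {
      cudaStream_t compute_stream, comm_stream;
      cudaEvent_t start_event, end_event;
      CUDA_CHECK(cudaStreamCreate(&compute_stream));
      CUDA_CHECK(cudaStreamCreate(&comm_stream));
      CUDA_CHECK(cudaEventCreateWithFlags(&start_event, cudaEventDisableTiming));
      CUDA_CHECK(cudaEventCreateWithFlags(&end_event, cudaEventDisableTiming));

      // ... compute kernels would be enqueued on compute_stream here ...

      // The collective must not start before compute_stream has produced the grads.
      CUDA_CHECK(cudaEventRecord(start_event, compute_stream));
      CUDA_CHECK(cudaStreamWaitEvent(comm_stream, start_event, 0));

      // ... the fused allreduce would be enqueued on comm_stream here ...

      // Compute must not consume the reduced grads before the collective finishes.
      CUDA_CHECK(cudaEventRecord(end_event, comm_stream));
      CUDA_CHECK(cudaStreamWaitEvent(compute_stream, end_event, 0));

      CUDA_CHECK(cudaStreamSynchronize(compute_stream));
      CUDA_CHECK(cudaEventDestroy(start_event));
      CUDA_CHECK(cudaEventDestroy(end_event));
      CUDA_CHECK(cudaStreamDestroy(comm_stream));
      CUDA_CHECK(cudaStreamDestroy(compute_stream));
      return 0;
    }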
override; + platform::Place GetPlace() const { return place_; } + + void RunOnVar(Variable *var, bool record_event = false); + + std::string LossGradName() const; + protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index ad47846c59a05..5d271d06b6922 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -22,7 +22,9 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/profiler.h" + namespace paddle { namespace framework { namespace details { @@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( PrepareLocalExeScopes(); } +static void RunProgramDescs(const ProgramDescs &programs, + const std::vector &local_exec_scopes, + const std::vector &places) { + for (auto &program : programs) { + for (auto &op_desc : program.Block(0).AllOps()) { + for (size_t i = 0; i < local_exec_scopes.size(); ++i) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_exec_scopes[i], places[i]); + } + } + } +} + FetchResultType ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + strategy_.num_iteration_per_drop_scope_ = + std::numeric_limits::max(); + DropLocalExeScopes(/*need_wait=*/false); + } +#endif + if (drop_scope_counter_ == 0) { platform::RecordEvent e("InitLocalVars"); InitVariables(); @@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run( ++drop_scope_counter_; if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ || DropScopeOrNot()) { - DropLocalExeScopes(); + DropLocalExeScopes(!platform::IsCUDAGraphCapturing()); } if (VLOG_IS_ON(5)) { @@ -128,15 +151,7 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if (graph.Has(details::kStartupProgramDescs)) { auto &program_descs = graph.Get(details::kStartupProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } is_initialized_ = true; } @@ -144,23 +159,17 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if (graph.Has(details::kProgramDescs)) { auto &program_descs = graph.Get(details::kProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } } -void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() { +void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { platform::RecordEvent drop_scope_event("DropLocalExeScopes"); drop_scope_counter_ = 0; - for (auto &p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + if (need_wait) { + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } } 
scope_monitor_.ClearHistoryLocalExecScopes(); for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index aa2b113c960a3..ea5a3c07957bf 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FetchResultType Run(const std::vector& fetch_tensors, bool return_merged) override; - void DropLocalExeScopes(); + void DropLocalExeScopes(bool need_wait = true); bool NeedCreateLocalExeScope(); diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 810e9a087d122..11beb84d74914 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -454,7 +454,6 @@ class PSGPUWorker : public HogwildWorker { virtual void Initialize(const TrainerDesc& desc); virtual void TrainFiles(); virtual void TrainFilesWithProfiler(); - virtual void SetNeedDump(bool need_dump_field); virtual void SetChannelWriter(ChannelObject* queue); virtual void SetWorkerNum(int num) { worker_num_ = num; } virtual void CacheProgram(const ProgramDesc& main_program) { @@ -467,7 +466,6 @@ class PSGPUWorker : public HogwildWorker { protected: void PushGradients(); - void DumpParam(); void CopySparseTable(); void CopyDenseTable(); void CopyDenseVars(); @@ -475,18 +473,12 @@ class PSGPUWorker : public HogwildWorker { private: int mpi_rank_; std::mutex mutex_; - std::vector send_var_list_; int worker_num_; ProgramDesc program_; HeterObjectPool object_pool_; - bool need_dump_param_; - std::vector dump_param_; bool need_to_push_dense_; - bool need_dump_field_; bool dump_slot_; bool need_to_push_sparse_; - std::vector dump_fields_; - ChannelWriter writer_; DownpourWorkerParameter param_; float scale_datanorm_; // just save the value in param_ for easy access diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 17d15a94c7287..e7a25de96a947 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -115,6 +115,7 @@ message BuildStrategy { optional bool enable_auto_fusion = 11 [ default = false ]; optional bool enable_addto = 12 [ default = false ]; optional bool fix_op_run_order = 13 [ default = false ]; + optional bool allow_cuda_graph_capture = 14 [ default = false ]; } message ExecutionStrategy { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index f1f5ba7789ea6..71b53b8a51882 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -30,14 +30,10 @@ static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; if (std::is_same>::value || std::is_same>::value) { - // The current dlpack library version is v0.2, and does not define - // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set - // dtype.code to 5U directly here. After the dlpack library version being - // upgraded to v0.4, it should be written as follow. 
- // dtype.code = kDLComplex; - dtype.code = 5U; + dtype.code = kDLComplex; + } else if (std::is_same::value) { + dtype.code = kDLBfloat; } else if (std::is_same::value || - std::is_same::value || std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { @@ -77,47 +73,47 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { #undef REG_DL_DATA_TYPE } -struct DLContextVisitor : public boost::static_visitor<::DLContext> { - inline ::DLContext operator()(const platform::CPUPlace &place) const { - ::DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; - return ctx; +struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { + inline ::DLDevice operator()(const platform::CPUPlace &place) const { + ::DLDevice device; + device.device_type = kDLCPU; + device.device_id = 0; + return device; } - inline ::DLContext operator()(const platform::XPUPlace &place) const { + inline ::DLDevice operator()(const platform::XPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::XPUPlace is not supported")); } - inline ::DLContext operator()(const platform::NPUPlace &place) const { + inline ::DLDevice operator()(const platform::NPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::NPUPlace is not supported")); } - inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const { + inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const { PADDLE_THROW(platform::errors::Unimplemented( "platform::NPUPinnedPlace is not supported")); } - inline ::DLContext operator()(const platform::CUDAPlace &place) const { + inline ::DLDevice operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ::DLContext ctx; - ctx.device_type = kDLGPU; - ctx.device_id = place.device; - return ctx; + ::DLDevice device; + device.device_type = kDLGPU; + device.device_id = place.device; + return device; #else PADDLE_THROW(platform::errors::Unavailable( "platform::CUDAPlace is not supported in CPU only version.")); #endif } - inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { + inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ::DLContext ctx; - ctx.device_type = kDLCPUPinned; - ctx.device_id = 0; - return ctx; + ::DLDevice device; + device.device_type = kDLCPUPinned; + device.device_id = 0; + return device; #else PADDLE_THROW(platform::errors::Unavailable( "platform::CUDAPinnedPlace is not supported in CPU only version.")); @@ -130,9 +126,9 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { // init data, data buffer t_.data = const_cast(tensor.data()); - // init ctx, DLContext type with device_type and device_id + // init device, DLDevice type with device_type and device_id auto place = tensor.place(); - t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place); // init dtype t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); @@ -156,10 +152,8 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { t_.byte_offset = 0; } -::DLManagedTensor *DLPackTensor::ToCudfCompatibleDLManagedTensor() { - // init shape, tensor dims - // for DLManagedTensor shape need to be compatible with ndim - // refer to cupy and cudf, we new int64[ndim] +::DLManagedTensor *DLPackTensor::ToDLManagedTensor() { + // init shape 
auto shape = new int64_t[t_.ndim]; using DimType = decltype(t_.ndim); // int for (DimType i = 0; i < t_.ndim; ++i) { @@ -167,19 +161,15 @@ ::DLManagedTensor *DLPackTensor::ToCudfCompatibleDLManagedTensor() { } t_.shape = shape; - // init strides, nullptr means the tensor is compact - // refer to cupy and cudf, the compact tensor first dim's strides need to be 1 - // and second dim's strides need to be length of rows of cudf - // cudf now only support dim=2 - PADDLE_ENFORCE_LE(t_.ndim, 2, platform::errors::InvalidArgument( - "cudf now only supports dimension is 2, " - "but received dimension is %d.", - t_.ndim)); - - if (t_.ndim > 1) - t_.strides = new int64_t[2]{1, t_.shape[1]}; - else - t_.strides = new int64_t[1]{1}; + // init strides + auto strides = new int64_t[t_.ndim]; + for (DimType i = 0; i < t_.ndim; ++i) { + strides[i] = 1; + } + for (DimType i = t_.ndim - 2; i >= 0; --i) { + strides[i] = t_.shape[i + 1] * strides[i + 1]; + } + t_.strides = strides; auto tensor = new DLManagedTensor; tensor->dl_tensor = t_; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index e342523718b34..03ed8884925ce 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -36,7 +36,7 @@ class DLPackTensor { inline operator ::DLTensor&() { return t_; } - ::DLManagedTensor* ToCudfCompatibleDLManagedTensor(); + ::DLManagedTensor* ToDLManagedTensor(); private: ::DLTensor t_; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 8265d105accae..4e2d7bb979b61 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -30,7 +30,11 @@ template constexpr uint8_t GetDLDataTypeCode() { if (std::is_same>::value || std::is_same>::value) { - return static_cast(5); + return static_cast(kDLComplex); + } + + if (std::is_same::value) { + return static_cast(kDLBfloat); } return std::is_same::value || @@ -55,15 +59,15 @@ void TestMain(const platform::Place &place, uint16_t lanes) { CHECK_EQ(p, dl_tensor.data); if (platform::is_cpu_place(place)) { - CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); - CHECK_EQ(0, dl_tensor.ctx.device_id); + CHECK_EQ(kDLCPU, dl_tensor.device.device_type); + CHECK_EQ(0, dl_tensor.device.device_id); } else if (platform::is_gpu_place(place)) { - CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(kDLGPU, dl_tensor.device.device_type); CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device, - dl_tensor.ctx.device_id); + dl_tensor.device.device_id); } else if (platform::is_cuda_pinned_place(place)) { - CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); - CHECK_EQ(0, dl_tensor.ctx.device_id); + CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type); + CHECK_EQ(0, dl_tensor.device.device_id); } else { CHECK_EQ(false, true); } @@ -83,8 +87,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) { } template -void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, - uint16_t lanes) { +void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { DDim dims{6, 7}; Tensor tensor; tensor.Resize(dims); @@ -92,8 +95,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, DLPackTensor dlpack_tensor(tensor, lanes); - ::DLManagedTensor *dl_managed_tensor = - dlpack_tensor.ToCudfCompatibleDLManagedTensor(); + ::DLManagedTensor *dl_managed_tensor = dlpack_tensor.ToDLManagedTensor(); CHECK_EQ(dl_managed_tensor->manager_ctx == nullptr, true); @@ -101,7 +103,8 @@ void 
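ToDLManagedTensor() above now fills in ordinary compact row-major strides instead of the cudf-specific two-dimensional layout. A standalone sketch of that computation; RowMajorStrides is an illustrative name:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Innermost dimension has stride 1; each outer stride is the product of the
    // inner shape entries.
    std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& shape) {
      std::vector<int64_t> strides(shape.size(), 1);
      for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
        strides[i] = shape[i + 1] * strides[i + 1];
      }
      return strides;
    }

    int main() {
      auto strides = RowMajorStrides({6, 7});
      std::printf("strides = {%lld, %lld}\n",
                  static_cast<long long>(strides[0]),
                  static_cast<long long>(strides[1]));
      return 0;
    }

For the {6, 7} tensor used in the unit test this yields {7, 1}, matching the updated CHECKs.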
TestToCudfCompatibleDLManagedTensor(const platform::Place &place, CHECK_EQ(dims[i], dl_managed_tensor->dl_tensor.shape[i]); } - CHECK_EQ(dl_managed_tensor->dl_tensor.strides[0] == 1, true); + CHECK_EQ(dl_managed_tensor->dl_tensor.strides[0] == 7, true); + CHECK_EQ(dl_managed_tensor->dl_tensor.strides[1] == 1, true); dl_managed_tensor->deleter(dl_managed_tensor); } @@ -122,7 +125,7 @@ void TestMainLoop() { for (auto &p : places) { for (auto &l : lanes) { TestMain(p, l); - TestToCudfCompatibleDLManagedTensor(p, l); + TestToDLManagedTensor(p, l); } } } diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index dc5e24ef5de42..4346c144fab7f 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1347,6 +1347,20 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { #endif } +void FleetWrapper::SetFileNumOneShard(const uint64_t table_id, int file_num) { +#ifdef PADDLE_WITH_PSLIB + auto ret = + pslib_ptr_->_worker_ptr->set_file_num_one_shard(table_id, file_num); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "set_file_num_one_shard failed"; + } +#else + VLOG(0) << "FleetWrapper::SetFileNumOneShard does nothing when no pslib"; +#endif +} + double FleetWrapper::GetCacheThreshold(int table_id) { #ifdef PADDLE_WITH_PSLIB double cache_threshold = 0.0; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index c1db06a298c86..d368b421ff2a0 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -266,6 +266,7 @@ class FleetWrapper { bool load_combine); void PrintTableStat(const uint64_t table_id); + void SetFileNumOneShard(const uint64_t table_id, int file_num); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index 489cef9f04654..14e5f2f51924b 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -71,6 +71,18 @@ void HdfsStore::set(const std::string& key, const std::vector& data) { } } paddle::framework::fs_mv(tmp, path); + auto start = std::chrono::steady_clock::now(); + while (paddle::framework::fs_exists(path) == false) { + VLOG(0) << "HdfsStore::set fs_mv retrying..."; + paddle::framework::fs_mv(tmp, path); + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + if (wait_timeout_ != gloo::kNoTimeout && elapsed > wait_timeout_) { + PADDLE_THROW(paddle::platform::errors::ExecutionTimeout( + "fs_mv failed, tmp: %s, path: %s", tmp, path)); + } + std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_)); + } #endif } @@ -140,6 +152,7 @@ void HdfsStore::wait(const std::vector& keys, auto start = std::chrono::steady_clock::now(); std::vector check_key_status(keys.size(), false); while (!Check(keys, &check_key_status)) { + VLOG(0) << "HdfsStore::wait checking repeatedly..."; auto elapsed = std::chrono::duration_cast( std::chrono::steady_clock::now() - start); if (wait_timeout_ != gloo::kNoTimeout && elapsed > wait_timeout_) { @@ -209,6 +222,8 @@ void ParallelConnectContext::connectFullMesh( // Create pairs auto transportContext = dev->createContext(rank, size); transportContext->setTimeout(getTimeout()); + VLOG(0) << "transportContext timeout: 
" << getTimeout().count() + << ", curr rank: " << rank; for (int i = 0; i < size; i++) { if (i == rank) { continue; @@ -225,6 +240,7 @@ void ParallelConnectContext::connectFullMesh( std::vector> connect_threads(thread_num_); // Connect every pair + VLOG(0) << "connect_thread_num: " << thread_num_ << ", size: " << size; for (uint32_t i = 0; i < connect_threads.size(); ++i) { connect_threads[i].reset(new std::thread( [&store, &transportContext, total_add_size, this]( @@ -252,10 +268,36 @@ void ParallelConnectContext::connectFullMesh( sleep(5); --max_retry_times; } - auto addr = extractAddress(allAddrs, i); + if (addr.empty()) { + VLOG(0) << "peer address is null"; + } + Impl impl_; + memcpy(&impl_, addr.data(), sizeof(impl_)); + struct sockaddr_in* sa = (struct sockaddr_in*)&(impl_.ss); + std::string ip = getCharIpAddr(sa->sin_addr.s_addr); + VLOG(0) << "peer " << i << " ip addr: " << ip + << ", port: " << sa->sin_port; + + auto start = std::chrono::steady_clock::now(); + std::chrono::seconds connect_wait_timeout_ = + std::chrono::seconds(600); + while (true) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + if (elapsed > connect_wait_timeout_) { + break; + } + try { + transportContext->getPair(i)->connect(addr); + break; + } catch (...) { + VLOG(0) << "gloo connect failed, retrying..."; + } + } transportContext->getPair(i)->connect(addr); } + VLOG(0) << "peer connected success"; }, i, connect_threads.size())); } @@ -264,6 +306,7 @@ void ParallelConnectContext::connectFullMesh( } device_ = dev; transportContext_ = std::move(transportContext); + VLOG(0) << "ParallelConnectContext::connectFullMesh() is over"; } #endif } // namespace rendezvous diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index 4eb40da1bfd39..eafc991fbca0a 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -97,6 +97,26 @@ class ParallelConnectContext : public gloo::rendezvous::Context { // slowly in case big size, especialy in HdfsStore void connectFullMesh(Store& store, // NOLINT std::shared_ptr& dev); // NOLINT + struct Impl { + // IP address of the listening socket. + struct sockaddr_storage ss; + // Sequence number of this address. + // If this is equal to -1, the address is assumed to + // represent the listening socket of a device. The sequence number + // must be set before it can be used by a pair. 
+ ssize_t seq{-1}; + }; + std::string getCharIpAddr(uint32_t ipAddress) { + const int NBYTES = 4; + uint8_t octet[NBYTES]; + char ipAddressFinal[16]; + for (int i = 0; i < NBYTES; i++) { + octet[i] = ipAddress >> (i * 8); + } + snprintf(ipAddressFinal, sizeof(ipAddressFinal), "%d.%d.%d.%d", octet[0], + octet[1], octet[2], octet[3]); + return std::string(ipAddressFinal); + } protected: int thread_num_ = 6; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 784cbc3d90b86..d1e98a711dc9d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -45,9 +45,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - MultiSlotDataset* dataset = dynamic_cast(dataset_); gpu_task->init(thread_keys_shard_num_, device_num); - auto input_channel = dataset->GetInputChannel(); auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; @@ -68,35 +66,83 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { for (int i = 0; i < thread_keys_thread_num_; i++) { thread_keys_[i].resize(thread_keys_shard_num_); } - const std::deque& vec_data = input_channel->GetData(); - size_t total_len = vec_data.size(); - size_t len_per_thread = total_len / thread_keys_thread_num_; - int remain = total_len % thread_keys_thread_num_; + + size_t total_len = 0; + size_t len_per_thread = 0; + int remain = 0; size_t begin = 0; - auto gen_func = [this](const std::deque& total_data, int begin_index, - int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins.uint64_feasigns_; - for (const auto feasign : feasign_v) { - uint64_t cur_key = feasign.sign().uint64_feasign_; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); + + std::string data_set_name = std::string(typeid(*dataset_).name()); + + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { + VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + VLOG(0) << "yxf::buildtask::inputslotchannle size: " + << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + VLOG(0) << "total len: " << total_len; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(feasign); + } } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - threads.push_back(std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - begin += len_per_thread + (i < remain ? 
1 : 0); - } - for (std::thread& t : threads) { - t.join(); + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + } else { + CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins.uint64_feasigns_; + for (const auto feasign : feasign_v) { + uint64_t cur_key = feasign.sign().uint64_feasign_; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } - timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; timeline.Start(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b7e8bbb369492..fa2ff6cbdb8c7 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -117,6 +117,15 @@ class PSGPUWrapper { resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); keys_tensor.resize(resource_->total_gpu()); +#ifdef PADDLE_WITH_GLOO + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + if (gloo->Size() > 1) { + multi_node_ = 1; + } +#else + PADDLE_THROW( + platform::errors::Unavailable("heter ps need compile with GLOO")); +#endif if (multi_node_) { int dev_size = dev_ids.size(); // init inner comm @@ -127,7 +136,6 @@ class PSGPUWrapper { // init inter comm #ifdef PADDLE_WITH_GLOO inter_comms_.resize(dev_size); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 99c691e6cf6f7..6f5f27400752d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,6 +59,7 @@ cc_library(placement_pass_base SRCS placement_pass_base.cc DEPS pass) cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS graph graph_helper) pass_library(graph_to_program_pass base) +pass_library(paddle_to_cinn_pass base DEPS cinn_runner) pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base DEPS string_helper) pass_library(fc_fuse_pass inference) @@ -142,6 +143,7 @@ cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) 
cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +cc_test(paddle_to_cinn_pass_test SRCS paddle_to_cinn_pass_test.cc DEPS paddle_to_cinn_pass proto_desc) cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 9eba6fc89a2e9..085298314ea3f 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -224,6 +225,115 @@ bool GeneratePass::VerifyGraph(const Graph& graph) { return true; } +namespace generate_pass { + +VarHelper::VarHelper(const char* name) : name_(name), type_(Type::kInput) {} +VarHelper::VarHelper(const std::string& name, Type type) + : name_(name), type_(type) {} + +OpHelper::OpHelper(const char* type, SubgraphHelper* subgraph_helper) + : type_(type), subgraph_helper_(subgraph_helper) { + op_desc_ = subgraph_helper_->ProgramDesc()->mutable_blocks(0)->add_ops(); + op_desc_->set_type(type_); +} + +OpHelper::Arguments::Arguments(const char* parameter, + const VarHelper& var_helper) + : parameter_(parameter) { + var_helpers_.push_back(var_helper); +} + +OpHelper::Arguments::Arguments(const char* parameter, + std::initializer_list var_helpers) + : parameter_(parameter), var_helpers_(var_helpers) {} + +OpHelper& OpHelper::operator()(const Arguments& input) { + proto::OpDesc::Var* var = op_desc_->add_inputs(); + var->set_parameter(input.parameter_); + for (const VarHelper& var_helper : input.var_helpers_) { + var->add_arguments()->assign(var_helper.name_); + if (VarHelper::Type::kInput == var_helper.type_) { + subgraph_helper_->AddInputVar(var_helper.name_); + } + } + return *this; +} + +OpHelper& OpHelper::operator()(std::initializer_list inputs) { + for (const auto& input : inputs) { + operator()(input); + } + return *this; +} + +VarHelper OpHelper::Out(const char* name) { + std::string argument = patterns::UniqueKey(type_); + proto::OpDesc::Var* var = op_desc_->add_outputs(); + var->set_parameter(name); + var->add_arguments()->assign(argument); + return VarHelper(argument, VarHelper::Type::kOutput); +} + +proto::ProgramDesc* SubgraphHelper::ProgramDesc() { return &program_desc_; } + +const proto::ProgramDesc& SubgraphHelper::ProgramDesc() const { + return program_desc_; +} + +const std::vector& SubgraphHelper::InputVars() const { + return input_vars_; +} + +const std::vector& SubgraphHelper::OutputVars() const { + return output_vars_; +} + +void SubgraphHelper::AddInputVar(const std::string& name) { + auto iter = std::find(input_vars_.begin(), input_vars_.end(), name); + if (input_vars_.end() == iter) { + input_vars_.push_back(name); + } +} + +void SubgraphHelper::AddOutputVars(const VarHelper& var_helper) { + output_vars_.push_back(var_helper.name_); +} + +} // namespace generate_pass + +PassPairs::PassPairs(const SubgraphType& pattern, const SubgraphType& replace) { + AddPassDesc(pattern, replace); +} + +void PassPairs::AddPassDesc(const SubgraphType& pattern, + const SubgraphType& replace) { + proto::PassDesc* pass_desc = 
multi_pass_desc_.add_pass_descs(); + pass_desc->mutable_pattern()->CopyFrom(pattern.ProgramDesc()); + pass_desc->mutable_replace()->CopyFrom(replace.ProgramDesc()); + PADDLE_ENFORCE_EQ(pattern.InputVars().size(), replace.InputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression arguments is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.InputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + var_map->set_pattern_var(pattern.InputVars()[i]); + var_map->set_replace_var(replace.InputVars()[i]); + } + PADDLE_ENFORCE_EQ(pattern.OutputVars().size(), replace.OutputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression returns is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.OutputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + var_map->set_pattern_var(pattern.OutputVars()[i]); + var_map->set_replace_var(replace.OutputVars()[i]); + } +} + +const proto::MultiPassDesc& PassPairs::MultiPassDesc() const { + return multi_pass_desc_; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass.h b/paddle/fluid/framework/ir/generate_pass.h index f73173233aed3..26e5231fbc16e 100644 --- a/paddle/fluid/framework/ir/generate_pass.h +++ b/paddle/fluid/framework/ir/generate_pass.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/pass_desc.pb.h" @@ -43,6 +42,158 @@ class GeneratePass : public Pass { proto::MultiPassDesc multi_pass_desc_; }; +namespace generate_pass { + +class VarHelper; +class OpHelper; +class SubgraphHelper; + +// VarHelper is used to represent a variable node. +struct VarHelper { + enum class Type { kInput, kOutput }; + + explicit VarHelper(const char* name); + VarHelper(const std::string& name, Type type); + + std::string name_; + Type type_; +}; + +// OpHelper is used to represent a operator node. +class OpHelper { + public: + // Convert multiple inputs. + struct Arguments { + Arguments(const char* parameter, const VarHelper& var_helper); + Arguments(const char* parameter, + std::initializer_list var_helpers); + + std::string parameter_; + std::vector var_helpers_; + }; + + OpHelper(const char* type, SubgraphHelper* subgraph_helper); + + OpHelper& operator()(const Arguments& input); + OpHelper& operator()(std::initializer_list inputs); + + VarHelper Out(const char* name); + + private: + OpHelper() = delete; + DISABLE_COPY_AND_ASSIGN(OpHelper); + + const char* type_; + proto::OpDesc* op_desc_; + SubgraphHelper* subgraph_helper_; +}; + +/* + * SubgraphHelper is used to define pattern/replace subgraphs. + * + * Use lambda expression to define subgraph like Python. SubgraphHelper + * converts lambda expression to ProgramDesc. + * + * In order to define a subgraph, user need to use VarHelper and OpHelper. + * Use the macros instead of class names, so user can develop better and + * don't need to know too much about underlying implementation. 
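+ * In the lambda, VAR_(x) declares an input variable node named "x",
+ * OP_(type) appends an operator of that type to the SubgraphHelper captured
+ * in the lambda as "subgraph", and SUBGRAPH_(name) binds the lambda to a
+ * SubgraphHelper so its ProgramDesc, input variables and output variables
+ * can be collected.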
+ * + * An example of defining a subgraph as follows: + * + * SUBGRAPH_(subgraph)([subgraph=&subgraph](VAR_(x), VAR_(y), VAR_(z)) { + * auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + * auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + * return ewadd2; + * }); + * + */ +class SubgraphHelper { + public: + SubgraphHelper() = default; + // The lambda expression is a prvalue expression. + template + SubgraphHelper& operator=(const T&& f) { + proto::BlockDesc* block = program_desc_.add_blocks(); + block->set_idx(0); + block->set_parent_idx(0); + AddOutputVars(f()); + return *this; + } + + proto::ProgramDesc* ProgramDesc(); + const proto::ProgramDesc& ProgramDesc() const; + const std::vector& InputVars() const; + const std::vector& OutputVars() const; + + void AddInputVar(const std::string& name); + + void AddOutputVars(const VarHelper& var_helper); + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + AddOutputVars(outputs); + } + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + } + + template + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars<0>(outputs); + } + + private: + DISABLE_COPY_AND_ASSIGN(SubgraphHelper); + std::vector input_vars_; + std::vector output_vars_; + proto::ProgramDesc program_desc_; +}; + +} // namespace generate_pass + +class PassPairs { + public: + using SubgraphType = generate_pass::SubgraphHelper; + + PassPairs() = default; + PassPairs(const SubgraphType& pattern, const SubgraphType& replace); + + void AddPassDesc(const SubgraphType& pattern, const SubgraphType& replace); + + const proto::MultiPassDesc& MultiPassDesc() const; + + private: + proto::MultiPassDesc multi_pass_desc_; +}; + +// Use function to register in CC. 
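+// REGISTER_GENERATE_PASS(foo) forward-declares PassPairs register_foo(),
+// registers MacroPassHelper<&register_foo> under the pass name foo, and then
+// opens the definition of register_foo(); the user-supplied body builds and
+// returns the PassPairs that MacroPassHelper converts into a GeneratePass.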
+template +class MacroPassHelper : public GeneratePass { + public: + MacroPassHelper() : GeneratePass(Functor().MultiPassDesc()) {} +}; + +#define VAR_(name) \ + ::paddle::framework::ir::generate_pass::VarHelper name = \ + ::paddle::framework::ir::generate_pass::VarHelper(#name) +#define OP_(type) \ + ::paddle::framework::ir::generate_pass::OpHelper(#type, subgraph) +#define SUBGRAPH_(name) \ + ::paddle::framework::ir::generate_pass::SubgraphHelper name; \ + name + +#define REGISTER_GENERATE_PASS(pass_type) \ + paddle::framework::ir::PassPairs register_##pass_type(); \ + REGISTER_PASS( \ + pass_type, \ + ::paddle::framework::ir::MacroPassHelper<®ister_##pass_type>); \ + paddle::framework::ir::PassPairs register_##pass_type() + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index c3852d29c308f..6876dde50c157 100644 --- a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -16,234 +16,71 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -namespace paddle { -namespace framework { -namespace ir { - -template -class CXXGeneratePass : public GeneratePass { - public: - CXXGeneratePass() : GeneratePass(Functor()) {} -}; - -#define REGISTER_GENERATE_PASS(pass_type, function) \ - REGISTER_PASS(pass_type, ::paddle::framework::ir::CXXGeneratePass<&function>) - -proto::MultiPassDesc generate_fc_fuse() { - proto::MultiPassDesc multi_pass_desc; +REGISTER_GENERATE_PASS(generate_fc_fuse) { + paddle::framework::ir::PassPairs pass_pairs; for (bool with_relu : {true, false}) { - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - pattern->set_idx(0); - pattern->set_parent_idx(0); - proto::OpDesc* mul = pattern->add_ops(); - mul->set_type("mul"); - proto::OpDesc::Var* mul_x = mul->add_inputs(); - mul_x->set_parameter("X"); - mul_x->add_arguments()->assign("x"); - proto::OpDesc::Var* mul_y = mul->add_inputs(); - mul_y->set_parameter("Y"); - mul_y->add_arguments()->assign("w"); - proto::OpDesc::Var* mul_out = mul->add_outputs(); - mul_out->set_parameter("Out"); - mul_out->add_arguments()->assign("mul_out"); - proto::OpDesc* ewadd = pattern->add_ops(); - ewadd->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_x = ewadd->add_inputs(); - ewadd_x->set_parameter("X"); - ewadd_x->add_arguments()->assign("mul_out"); - proto::OpDesc::Var* ewadd_y = ewadd->add_inputs(); - ewadd_y->set_parameter("Y"); - ewadd_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_out = ewadd->add_outputs(); - ewadd_out->set_parameter("Out"); - ewadd_out->add_arguments()->assign("ewadd_out"); - proto::OpDesc* relu = nullptr; - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - replace->set_idx(0); - replace->set_parent_idx(0); - proto::OpDesc* fc = replace->add_ops(); - fc->set_type("fc"); - proto::OpDesc::Var* fc_x = fc->add_inputs(); - fc_x->set_parameter("Input"); - fc_x->add_arguments()->assign("x"); - proto::OpDesc::Var* fc_w = fc->add_inputs(); - fc_w->set_parameter("W"); - fc_w->add_arguments()->assign("w"); - proto::OpDesc::Var* fc_b = fc->add_inputs(); - fc_b->set_parameter("Bias"); - fc_b->add_arguments()->assign("b"); - proto::OpDesc::Var* fc_out = fc->add_outputs(); - fc_out->set_parameter("Out"); - fc_out->add_arguments()->assign("fc_out"); - for (const char* var : {"x", "w", "b", 
"fc_out"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - proto::PassDesc::AttrMap* attr_map = pass_desc->add_attr_maps(); - attr_map->set_pattern_op_idx(0); - attr_map->set_pattern_name("x_num_col_dims"); - attr_map->set_replace_op_idx(0); - attr_map->set_replace_name("in_num_col_dims"); - if (with_relu) { - relu = pattern->add_ops(); - relu->set_type("relu"); - proto::OpDesc::Var* relu_x = relu->add_inputs(); - relu_x->set_parameter("X"); - relu_x->add_arguments()->assign("ewadd_out"); - proto::OpDesc::Var* relu_out = relu->add_outputs(); - relu_out->set_parameter("Out"); - relu_out->add_arguments()->assign("relu_out"); - pass_desc->mutable_var_maps(3)->set_pattern_var("relu_out"); - proto::OpDesc::Attr* attr = fc->add_attrs(); - attr->set_name("activation_type"); - attr->set_type(proto::AttrType::STRING); - attr->set_s("relu"); - } else { - pass_desc->mutable_var_maps(3)->set_pattern_var("ewadd_out"); - } + // pattern + SUBGRAPH_(pattern) = + [ subgraph = &pattern, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + VLOG(3) << "exec lambda func."; + auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out"); + if (with_relu) { + return OP_(relu)({"X", ewadd}).Out("Out"); + } else { + return ewadd; + } + }; + // replace + SUBGRAPH_(replace) = + [ subgraph = &replace, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}}); + return fc.Out("Out"); + }; + pass_pairs.AddPassDesc(pattern, replace); } - return multi_pass_desc; + return pass_pairs; } -proto::MultiPassDesc generate_multi_add_to_addn() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* ewadd_0 = pattern->add_ops(); - ewadd_0->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_0_x = ewadd_0->add_inputs(); - ewadd_0_x->set_parameter("X"); - ewadd_0_x->add_arguments()->assign("a"); - proto::OpDesc::Var* ewadd_0_y = ewadd_0->add_inputs(); - ewadd_0_y->set_parameter("Y"); - ewadd_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_0_out = ewadd_0->add_outputs(); - ewadd_0_out->set_parameter("Out"); - ewadd_0_out->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc* ewadd_1 = pattern->add_ops(); - ewadd_1->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_1_x = ewadd_1->add_inputs(); - ewadd_1_x->set_parameter("X"); - ewadd_1_x->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc::Var* ewadd_1_y = ewadd_1->add_inputs(); - ewadd_1_y->set_parameter("Y"); - ewadd_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* ewadd_1_out = ewadd_1->add_outputs(); - ewadd_1_out->set_parameter("Out"); - ewadd_1_out->add_arguments()->assign("ewadd_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* addn = replace->add_ops(); - addn->set_type("add_n"); - proto::OpDesc::Var* addn_x = addn->add_inputs(); - addn_x->set_parameter("X"); - addn_x->add_arguments()->assign("a"); - addn_x->add_arguments()->assign("b"); - addn_x->add_arguments()->assign("c"); - proto::OpDesc::Var* addn_out = addn->add_outputs(); - addn_out->set_parameter("Out"); - addn_out->add_arguments()->assign("addn_out"); - for (const char* var : {"a", "b", "c", "ewadd_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); 
- var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("addn_out"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_multi_add_to_addn) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + return ewadd2; + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + return OP_(sum)({"X", {x, y, z}}).Out("Out"); + }; + return {pattern, replace}; } -proto::MultiPassDesc generate_combine_matmul() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* matmul_0 = pattern->add_ops(); - matmul_0->set_type("matmul"); - proto::OpDesc::Var* matmul_0_x = matmul_0->add_inputs(); - matmul_0_x->set_parameter("X"); - matmul_0_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_0_y = matmul_0->add_inputs(); - matmul_0_y->set_parameter("Y"); - matmul_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* matmul_0_out = matmul_0->add_outputs(); - matmul_0_out->set_parameter("Out"); - matmul_0_out->add_arguments()->assign("matmul_out_0"); - proto::OpDesc* matmul_1 = pattern->add_ops(); - matmul_1->set_type("matmul"); - proto::OpDesc::Var* matmul_1_x = matmul_1->add_inputs(); - matmul_1_x->set_parameter("X"); - matmul_1_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_1_y = matmul_1->add_inputs(); - matmul_1_y->set_parameter("Y"); - matmul_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* matmul_1_out = matmul_1->add_outputs(); - matmul_1_out->set_parameter("Out"); - matmul_1_out->add_arguments()->assign("matmul_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* concat = replace->add_ops(); - concat->set_type("concat"); - proto::OpDesc::Var* concat_x = concat->add_inputs(); - concat_x->set_parameter("X"); - concat_x->add_arguments()->assign("b"); - concat_x->add_arguments()->assign("c"); - proto::OpDesc::Var* concat_out = concat->add_outputs(); - concat_out->set_parameter("Out"); - concat_out->add_arguments()->assign("concat_out"); - proto::OpDesc* matmul = replace->add_ops(); - matmul->set_type("matmul"); - proto::OpDesc::Var* matmul_x = matmul->add_inputs(); - matmul_x->set_parameter("X"); - matmul_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_y = matmul->add_inputs(); - matmul_y->set_parameter("Y"); - matmul_y->add_arguments()->assign("concat_out"); - proto::OpDesc::Var* matmul_out = matmul->add_outputs(); - matmul_out->set_parameter("Out"); - matmul_out->add_arguments()->assign("matmul_out"); - proto::OpDesc* slice_0 = replace->add_ops(); - slice_0->set_type("slice"); - proto::OpDesc::Var* slice_0_x = slice_0->add_inputs(); - slice_0_x->set_parameter("X"); - slice_0_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_0_out = slice_0->add_outputs(); - slice_0_out->set_parameter("Out"); - slice_0_out->add_arguments()->assign("slice_out_0"); - proto::OpDesc* slice_1 = replace->add_ops(); - slice_1->set_type("slice"); - proto::OpDesc::Var* slice_1_x = slice_1->add_inputs(); - slice_1_x->set_parameter("X"); - slice_1_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_1_out = slice_1->add_outputs(); - slice_1_out->set_parameter("Out"); - 
slice_1_out->add_arguments()->assign("slice_out_1"); - for (const char* var : {"a", "b", "c", "matmul_out_0", "matmul_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("slice_out_0"); - pass_desc->mutable_var_maps(4)->set_replace_var("slice_out_1"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_combine_matmul) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto matmul1 = OP_(matmul)({{"X", x}, {"Y", y}}).Out("Out"); + auto matmul2 = OP_(matmul)({{"X", x}, {"Y", z}}).Out("Out"); + return std::make_tuple(matmul1, matmul2); + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + auto concat = OP_(concat)({"X", {y, z}}).Out("Out"); + auto matmul = OP_(matmul)({{"X", x}, {"Y", concat}}).Out("Out"); + auto slice1 = OP_(slice)({"X", matmul}).Out("Out"); + auto slice2 = OP_(slice)({"X", matmul}).Out("Out"); + return std::make_tuple(slice1, slice2); + }; + return {pattern, replace}; } -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_GENERATE_PASS(generate_fc_fuse, - paddle::framework::ir::generate_fc_fuse); -REGISTER_GENERATE_PASS(generate_multi_add_to_addn, - paddle::framework::ir::generate_multi_add_to_addn); -REGISTER_GENERATE_PASS(generate_combine_matmul, - paddle::framework::ir::generate_combine_matmul); - namespace paddle { namespace framework { namespace ir { TEST(GeneratePass, construct_with_string) { std::string binary_str; - generate_fc_fuse().SerializeToString(&binary_str); + register_generate_fc_fuse().MultiPassDesc().SerializeToString(&binary_str); GeneratePass generate_pass(binary_str); } @@ -318,7 +155,7 @@ TEST(GeneratePass, generate_multi_add_to_addn) { graph.reset(pass->Apply(graph.release())); int num_nodes_after = graph->Nodes().size(); - int num_addn_nodes_after = GetNumOpNodes(graph, "add_n"); + int num_addn_nodes_after = GetNumOpNodes(graph, "sum"); VLOG(3) << DebugString(graph); PADDLE_ENFORCE_EQ(num_nodes_before, num_nodes_after + 2, diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 4150d0ca555c9..449849762cb10 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2263,15 +2263,34 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "conv2d_transpose", - "elementwise_add", "elementwise_mul", "fc", - "fusion_gru", "fusion_lstm", "gelu", - "layer_norm", "matmul", "matmul_v2", - "pool2d", "prelu", "relu", - "reshape2", "softmax", "split", - "squeeze", "squeeze2", "sum", - "transpose2"}); + std::unordered_set({"cast", + "clip", + "concat", + "conv2d", + "conv2d_transpose", + "elementwise_add", + "elementwise_mul", + "expand_v2", + "fc", + "fusion_gru", + "fusion_lstm", + "gelu", + "layer_norm", + "matmul", + "matmul_v2", + "pool2d", + "prelu", + "relu", + "reshape2", + "scale", + "sigmoid", + "slice", + "softmax", + "split", + "squeeze", + "squeeze2", + "sum", + "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 
25bf03f426a1d..a97873e82f455 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign", "silu"}; + "softsign", "silu", "mish"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index 6764799d82866..fea12baf0651f 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) +cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc index 70b95c9154fd3..afd80e45cf65e 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" @@ -21,14 +22,23 @@ namespace paddle { namespace framework { namespace ir { +template +static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base, + const platform::Place &place) { + auto *op = dynamic_cast(op_base); + return op && op->GetPlace() == place; +} + static bool IsLockAndRecordEventFreeComputationOpHandle( details::ComputationOpHandle *op, const OpGraphView &graph_view) { if (!platform::is_gpu_place(op->GetPlace()) && !platform::is_xpu_place(op->GetPlace())) return false; for (auto &pending_op : graph_view.PendingOps(op)) { - auto *tmp = dynamic_cast(pending_op); - if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) { + if (!IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace()) && + !IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace())) { return false; } } diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc new file mode 100644 index 0000000000000..fbf2cfb8d41d6 --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" + +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +namespace paddle { +namespace framework { +namespace ir { + +void PaddleToCinnPass::ApplyImpl(ir::Graph* graph) const { + paddle2cinn::CinnRunner::GetInstance()->ReplaceWithCinn(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(paddle_to_cinn_pass, paddle::framework::ir::PaddleToCinnPass); diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass.h b/paddle/fluid/framework/ir/paddle_to_cinn_pass.h new file mode 100644 index 0000000000000..f3b9bd21ebf9c --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass.h @@ -0,0 +1,30 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class PaddleToCinnPass : public Pass { + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc b/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc new file mode 100644 index 0000000000000..49d2ce295f385 --- /dev/null +++ b/paddle/fluid/framework/ir/paddle_to_cinn_pass_test.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/ir/paddle_to_cinn_pass.h" + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace ir { + +TEST(PaddleToCinnPassTest, TodoTest) { + ProgramDesc program; + Graph graph(program); + + auto pass = paddle::framework::ir::PassRegistry::Instance().Get( + "paddle_to_cinn_pass"); + + pass->Apply(&graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(paddle_to_cinn_pass); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5958728946c2e..1864899b07e01 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -548,7 +548,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, std::string new_input = quantized_op_input_node->Name(); std::string new_output = dequant_op_out_node->Name(); - framework::OpDesc new_op_desc(base_op_desc, nullptr); + framework::OpDesc new_op_desc(base_op_desc, + quantized_op_node->Op()->Block()); new_op_desc.SetType(quantized_op_type); new_op_desc.SetAttr("enable_int8", true); if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" || diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 7d9d3d5fef14a..083d989cb5267 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -189,8 +189,6 @@ void InterpreterCore::Convert() { for (auto inst_id : filter_next) { dependecy_count_[inst_id]++; } - vec_instruction_[i].next_instruction_.all_next_ops_ = - std::move(filter_next); } for (size_t i = 0; i < vec_instruction_.size(); ++i) { @@ -356,31 +354,81 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr) { - auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_); - auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); - std::atomic op_run_number{0}; + async_work_queue_.PrepareAtomicDeps(dependecy_count_); + async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); + op_run_number_ = 0; for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() { - RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number); - }); + async_work_queue_.AddTask(vec_instr[i].type_, + [&, i] { RunInstructionAsync(i); }); } } async_work_queue_.WaitEmpty(); PADDLE_ENFORCE_EQ( - op_run_number.load(), vec_instr.size(), + op_run_number_.load(), vec_instr.size(), platform::errors::Fatal( "Required op_run_number == %d, but received op_run_number = %d.", - vec_instr.size(), op_run_number.load())); + vec_instr.size(), op_run_number_.load())); } -void InterpreterCore::RunInstructionAsync(size_t instr_id, - AtomicVectorSizeT* atomic_deps, - AtomicVectorSizeT* atomic_var_ref, - std::atomic* op_run_number) { +void InterpreterCore::RunNextInstruction(const Instruction& instr) { + auto& next_instr = instr.next_instruction_; + auto& atomic_deps = async_work_queue_.AtomicDeps(); + auto IsReady = [&](size_t next_id) { + return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1; + }; + + if (instr.type_ == OpFuncType::kQueueAsync) { 
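+    // IsReady() decrements the successor's remaining-dependency counter;
+    // fetch_sub returns the value held before the subtraction, so "== 1"
+    // means this instruction satisfied the last outstanding dependency and
+    // the successor can now be scheduled.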
+ // move all sync_ops into other threads + for (auto next_id : next_instr.synchronize_run_) { + if (IsReady(next_id)) { + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + // keep all async_ops running in current thread + for (auto next_id : next_instr.direct_run_) { + if (IsReady(next_id)) { + RunInstructionAsync(next_id); + } + } + for (auto next_id : next_instr.event_wait_run_) { + if (IsReady(next_id)) { + RunInstructionAsync(next_id); + } + } + } else { + // move async_ops into async_thread + for (auto next_id : next_instr.event_wait_run_) { + if (IsReady(next_id)) { + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + + for (size_t i = 0; i < next_instr.direct_run_.size(); ++i) { + auto next_id = next_instr.direct_run_[i]; + if (IsReady(next_id)) { + // only keep one op running in current thread + if (i == 0) { + RunInstructionAsync(next_id); + continue; + } + // move rest ops into other threads + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + } +} + +void InterpreterCore::RunInstructionAsync(size_t instr_id) { auto& instr_node = vec_instruction_[instr_id]; platform::RecordEvent instruction_event( instr_node.kernel_func_.operator_base_->Type()); @@ -389,32 +437,22 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id, RunInstruction(instr_node); event_manager_.RecordEvent(instr_node, place_); - op_run_number->fetch_add(1, std::memory_order_relaxed); + op_run_number_.fetch_add(1, std::memory_order_relaxed); - auto& next_instr = instr_node.next_instruction_.all_next_ops_; - - for (auto next_i : next_instr) { - // fetch_sub return value before applying sub - bool is_ready = - atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1; - if (is_ready) { - async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() { - RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number); - }); - } - } // GC infomation - CheckGC(instr_id, instr_node.gc_check_var_list, atomic_var_ref); + CheckGC(instr_id, instr_node.gc_check_var_list); + + RunNextInstruction(instr_node); } void InterpreterCore::CheckGC(size_t instr_id, - const std::vector& gc_check_list, - AtomicVectorSizeT* atomic_var_ref) { + const std::vector& gc_check_list) { auto& var_scope = *global_scope_; + auto& atomic_var_ref = async_work_queue_.AtomicVarRef(); for (auto var_id : gc_check_list) { - bool is_ready = atomic_var_ref->at(var_id)->fetch_sub( - 1, std::memory_order_relaxed) == 1; + bool is_ready = + atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1; if (is_ready && var_scope.vec_meta_info_[var_id].vardesc_ && !var_scope.vec_meta_info_[var_id].vardesc_->Persistable()) { gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id], diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index e594f9ca8b54b..47f23aff4f00e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -65,13 +65,10 @@ class InterpreterCore { void DryRunPrepare(const std::vector& feed_tensors); - void CheckGC(size_t instr_id, const std::vector& gc_check_list, - AtomicVectorSizeT* working_var_ref); + void CheckGC(size_t instr_id, const std::vector& gc_check_list); - void RunInstructionAsync(size_t instr_id, - AtomicVectorSizeT* working_dependecy_count, - 
AtomicVectorSizeT* working_var_ref, - std::atomic* op_run_number); + void RunInstructionAsync(size_t instr_id); + void RunNextInstruction(const Instruction& instr_id); void AddFetch(const std::vector& fetch_names); void BuildSkipShareLoDInfo(); @@ -101,6 +98,7 @@ class InterpreterCore { InterpreterCoreGarbageCollector gc_; std::vector gc_event_; + std::atomic op_run_number_{0}; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 16df5d794f4d4..3438fc3bd4dcd 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -12,31 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore_util.h" +#include + #include "paddle/fluid/framework/executor_gc_helper.h" namespace paddle { namespace framework { namespace interpretercore { -AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicDeps( +AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps( const std::vector& dependecy_count) { - AtomicVectorSizeT working_dependecy_count(dependecy_count.size()); + if (atomic_deps_.size() != dependecy_count.size()) { + atomic_deps_.clear(); + std::generate_n(std::back_inserter(atomic_deps_), dependecy_count.size(), + [] { return std::make_unique>(0); }); + } + for (size_t i = 0; i < dependecy_count.size(); ++i) { - working_dependecy_count[i] = - std::make_unique>(dependecy_count[i]); + atomic_deps_[i]->store(dependecy_count[i]); } - return working_dependecy_count; + return atomic_deps_; } -AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicVarRef( +AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicVarRef( const std::vector& vec_meta_info) { - AtomicVectorSizeT working_var_ref(vec_meta_info.size()); + if (atomic_var_ref_.size() != vec_meta_info.size()) { + atomic_var_ref_.clear(); + std::generate_n(std::back_inserter(atomic_var_ref_), vec_meta_info.size(), + [] { return std::make_unique>(0); }); + } for (size_t i = 0; i < vec_meta_info.size(); ++i) { - working_var_ref[i] = - std::make_unique>(vec_meta_info[i].var_ref_count_); + atomic_var_ref_[i]->store(vec_meta_info[i].var_ref_count_); } - return working_var_ref; + return atomic_var_ref_; } bool var_can_be_deleted(const std::string& name, const BlockDesc& block) { diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 259f1c615533d..2a5942c712365 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -66,9 +66,9 @@ class AsyncWorkQueue { queue_group_ = CreateWorkQueueGroup(group_options); } - AtomicVectorSizeT PrepareAtomicDeps( + AtomicVectorSizeT& PrepareAtomicDeps( const std::vector& dependecy_count); - AtomicVectorSizeT PrepareAtomicVarRef( + AtomicVectorSizeT& PrepareAtomicVarRef( const std::vector& vec_meta_info); void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } @@ -77,9 +77,14 @@ class AsyncWorkQueue { queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); } + AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; } + AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; } + private: size_t host_num_thread_; std::unique_ptr queue_group_; + AtomicVectorSizeT atomic_deps_; + AtomicVectorSizeT atomic_var_ref_; }; std::string get_memcpy_type(const 
platform::Place& src_place, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 9c0444b3157cb..19b7b6d5dc299 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -477,15 +477,10 @@ struct VariableScope { std::vector vec_meta_info_; }; -struct EventRun { - explicit EventRun(size_t op_id) : op_id_(op_id) {} - size_t op_id_; -}; struct NextInstruction { std::vector direct_run_; - std::vector event_wait_run_; - std::vector synchronize_run_; - std::vector all_next_ops_; + std::vector event_wait_run_; + std::vector synchronize_run_; }; struct EventInter { diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/run_queue.h index 1d32eb05d4d51..e457b20a3c35d 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/run_queue.h @@ -38,6 +38,7 @@ #include #include #include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" namespace paddle { namespace framework { @@ -102,7 +103,7 @@ class RunQueue { // PushBack adds w at the end of the queue. // If queue is full returns w, otherwise returns default-constructed Work. Work PushBack(Work w) { - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[(back - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -124,7 +125,7 @@ class RunQueue { return Work(); } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[back & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -146,7 +147,7 @@ class RunQueue { return 0; } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); unsigned size = Size(); unsigned mid = back; @@ -214,7 +215,7 @@ class RunQueue { // modification counters. alignas(64) std::atomic front_; alignas(64) std::atomic back_; - SpinLock mutex_; + paddle::memory::SpinLock mutex_; Elem array_[kSize]; // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 0eafbb027f042..9470fd9b69933 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -164,7 +164,7 @@ class OpDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. - uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: template diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 670cb36dcc3ab..2a543d48791a3 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1589,14 +1589,15 @@ void OperatorWithKernel::ParseInputDataType( "not initialized.", Type(), name, ctx.InputNames(name).at(i))); proto::VarType::Type tmp = t->type(); - PADDLE_ENFORCE( - tmp == *data_type || *data_type == default_data_type, - platform::errors::InvalidArgument( - "The DataType of %s Op's duplicable Variable %s must be " - "consistent. 
The current variable type is (%s), but the " - "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), - DataTypeToString(*data_type))); + PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, + platform::errors::InvalidArgument( + "The DataType of %s Op's duplicable or different " + "slot Variable %s must be " + "consistent or reigster GetExpectedKernelType. The " + "current variable type is (%s), but the " + "previous variable type is (%s).", + Type(), name, DataTypeToString(tmp), + DataTypeToString(*data_type))); *data_type = tmp; } } diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index 68edb7c89dd87..ab812a30981f0 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include "glog/logging.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt new file mode 100644 index 0000000000000..8621c7363a09f --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -0,0 +1,7 @@ +cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper lod_tensor proto_desc) +cc_library(cinn_compiled_object SRCS cinn_compiled_object.cc DEPS feed_fetch_method graph lod_tensor proto_desc) +cc_library(cinn_runner SRCS cinn_runner.cc DEPS cinn_cache_key cinn_compiled_object feed_fetch_method graph lod_tensor scope) + +cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) +cc_test(cinn_runner_test SRCS cinn_runner_test.cc DEPS cinn_runner proto_desc) +cc_test(cinn_compiled_object_test SRCS cinn_compiled_object_test.cc DEPS cinn_compiled_object) diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc new file mode 100644 index 0000000000000..ac6c83be4fae3 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -0,0 +1,87 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +CinnCacheKey::CinnCacheKey( + const ir::Graph& graph, + const std::map& feed_tensors) { + this->SetKey(graph, feed_tensors); +} + +CinnCacheKey::CinnCacheKey(const ir::Graph& graph, + const std::map& feed_shapes) { + this->SetKey(graph, feed_shapes); +} + +void CinnCacheKey::SetKey( + const ir::Graph& graph, + const std::map& feed_tensors) { + ProgramDesc program; + GraphToProgram(graph, &program); + program.Proto()->SerializeToString(&graph_serialize_str_); + for (const auto& name_tensor : feed_tensors) { + feed_shapes_[name_tensor.first] = name_tensor.second->dims(); + } +} + +void CinnCacheKey::SetKey(const ir::Graph& graph, + const std::map& feed_shapes) { + ProgramDesc program; + GraphToProgram(graph, &program); + program.Proto()->SerializeToString(&graph_serialize_str_); + feed_shapes_ = feed_shapes; +} + +bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { + return !this->operator==(other); +} + +bool CinnCacheKey::operator==(const CinnCacheKey& other) const { + return graph_serialize_str_ == other.graph_serialize_str_ && + feed_shapes_ == other.feed_shapes_; +} + +size_t CinnCacheKey::Hash::hash_combine(size_t seed, size_t value) { + return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} + +size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { + std::size_t ret = 0; + + std::hash string_hasher; + for (const auto& name_shape : key.feed_shapes_) { + ret = hash_combine(ret, string_hasher(name_shape.first)); + ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); + } + + ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + return ret; +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h new file mode 100644 index 0000000000000..9627ae92aaba2 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -0,0 +1,63 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// Class to store the keys for compiling CINN. +// +// CINN cannot handle changable shape now, so CinnRunner keeps a cache mapping +// from CinnCacheKey to CinnCompiledObject. +// +// The CinnCacheKey contains a graph serialized string and the feeded tensor +// shapes. 
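+//
+// Two keys compare equal when both the serialized graph and the feed shapes
+// match, and CinnCacheKey::Hash combines the hashes of both parts, so the
+// key can be used directly with std::unordered_map or std::unordered_set.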
+class CinnCacheKey { + public: + CinnCacheKey(const ir::Graph& graph, + const std::map& feed_tensors); + CinnCacheKey(const ir::Graph& graph, + const std::map& feed_shapes); + + ~CinnCacheKey() {} + + void SetKey(const ir::Graph& graph, + const std::map& feed_tensors); + void SetKey(const ir::Graph& graph, + const std::map& feed_shapes); + + bool operator==(const CinnCacheKey& other) const; + bool operator!=(const CinnCacheKey& other) const; + + struct Hash { + static size_t hash_combine(size_t seed, size_t value); + size_t operator()(const CinnCacheKey& key) const; + }; + + private: + std::string graph_serialize_str_; + std::map feed_shapes_; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc new file mode 100644 index 0000000000000..a84ade26bfd12 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +TEST(CinnCacheKeyTest, TestAsUnorderedKey) { + std::unordered_set test_set; + + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph(program); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map feed_tensors = { + {"X", tensor_pointer}}; + + DDim ddim = paddle::framework::make_ddim({1, 2, 3}); + std::map feed_shapes = {{"X", ddim}}; + + CinnCacheKey cache_key1(empty_graph, feed_tensors); + CinnCacheKey cache_key2(empty_graph, feed_shapes); + EXPECT_EQ(cache_key1, cache_key2); + + CinnCacheKey cache_key3(graph, feed_shapes); + CinnCacheKey cache_key4(graph, feed_tensors); + EXPECT_EQ(cache_key3, cache_key4); + + CinnCacheKey cache_key5(empty_graph, + std::map()); + CinnCacheKey cache_key6(empty_graph, std::map()); + EXPECT_EQ(cache_key5, cache_key6); + + EXPECT_NE(cache_key1, cache_key3); + EXPECT_NE(cache_key4, cache_key2); + + EXPECT_NE(cache_key3, cache_key5); + EXPECT_NE(cache_key6, cache_key4); + + EXPECT_NE(cache_key5, cache_key1); + EXPECT_NE(cache_key2, cache_key6); + + test_set.insert(cache_key1); + test_set.insert(cache_key2); + test_set.insert(cache_key3); + test_set.insert(cache_key4); + test_set.insert(cache_key5); + test_set.insert(cache_key6); + EXPECT_EQ(test_set.size(), 3U); + + auto iter = test_set.find(cache_key1); + EXPECT_NE(iter, test_set.end()); + 
test_set.erase(iter); + EXPECT_EQ(test_set.size(), 2U); + EXPECT_EQ(test_set.find(cache_key2), test_set.end()); + + iter = test_set.find(cache_key3); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 1U); + EXPECT_EQ(test_set.find(cache_key4), test_set.end()); + + iter = test_set.find(cache_key5); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 0U); + EXPECT_EQ(test_set.find(cache_key6), test_set.end()); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc new file mode 100644 index 0000000000000..a90494bafe9bb --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +CinnCompiledObject::CinnCompiledObject() { + // TODO(zhhsplendid): complete this function after CINN interface is ready +} +CinnCompiledObject::~CinnCompiledObject() { + // TODO(zhhsplendid): complete this function after CINN interface is ready +} + +void CinnCompiledObject::Compile( + const ir::Graph& graph, + std::map* feed_targets) { + // TODO(zhhsplendid): complete this function after CINN interface is ready +} + +std::map CinnCompiledObject::Run( + Scope* scope, std::map* feed_targets) { + // TODO(zhhsplendid): complete this function after CINN interface is ready + return std::map(); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h new file mode 100644 index 0000000000000..21191d4434587 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h @@ -0,0 +1,50 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// Class to store and call CINN complied object +class CinnCompiledObject { + public: + CinnCompiledObject(); + ~CinnCompiledObject(); + + // Compiles use CINN. CINN compilation needs model graph, input names, and + // input_shapes + void Compile(const ir::Graph& graph, + std::map* feed_targets); + + // Feed LoDTensors to tun CINN compiled object and return fetched result + std::map Run( + Scope* scope, std::map* feed_targets); + + // Converts compiled object to Paddle Graph + // To be discussed + // ir::Graph ToGraph(); +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc new file mode 100644 index 0000000000000..5a7861edf210c --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiled_object_test.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +TEST(CinnCompiledObjecctTest, TodoTest) { + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + std::map empty_feed; + Scope empty_scope; + + CinnCompiledObject compiled_obj; + compiled_obj.Compile(empty_graph, &empty_feed); + auto fetch = compiled_obj.Run(&empty_scope, &empty_feed); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc new file mode 100644 index 0000000000000..ba90095cae679 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; + +std::once_flag CinnRunner::get_instance_once_flag_; +std::shared_ptr CinnRunner::instance_; + +std::shared_ptr CinnRunner::GetInstance() { + std::call_once(get_instance_once_flag_, + [&]() { instance_.reset(new CinnRunner()); }); + return instance_; +} + +void CinnRunner::ReplaceWithCinn(Graph* graph) { + // TODO(zhhsplendid): call CINN Api when it is ready +} + +std::map CinnRunner::Run( + const Graph& graph, Scope* scope, + std::map* feed_targets) { + CinnCacheKey cur_key(graph, *feed_targets); + std::shared_ptr obj_to_run; + if (cache_.find(cur_key) != cache_.end()) { + obj_to_run = cache_[cur_key]; + } else { + obj_to_run = std::make_shared(); + obj_to_run->Compile(graph, feed_targets); + cache_[cur_key] = obj_to_run; + } + return obj_to_run->Run(scope, feed_targets); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner.h b/paddle/fluid/framework/paddle2cinn/cinn_runner.h new file mode 100644 index 0000000000000..23d9565d2f392 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiled_object.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// Entrance to run CINN. +// +// CINN cannot handle changable shape now, so CinnRunner keeps a cache mapping +// from CinnCacheKey to CinnCompiledObject. If cache hits, we will re-use cache +// stored CinnCompiledObject, otherwise we will compile again and put into +// cache. 
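+//
+// A typical (illustrative) call sequence, similar to what
+// ParallelExecutor::RunFromCinn does; "graph", "scope" and "x_tensor" stand in
+// for real objects:
+//
+//   auto runner = CinnRunner::GetInstance();
+//   runner->ReplaceWithCinn(&graph);  // rewrite subgraphs (stub for now)
+//   std::map<std::string, const LoDTensor*> feeds = {{"X", &x_tensor}};
+//   auto fetches = runner->Run(graph, &scope, &feeds);
+//
+// Run() builds a CinnCacheKey from the graph and the feed tensors and only
+// calls CinnCompiledObject::Compile when that key is not found in the cache.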
+class CinnRunner { + public: + ~CinnRunner() {} + + // Singleton + static std::shared_ptr GetInstance(); + + // Replace Paddle graph with some CINN subgraphs/ops + void ReplaceWithCinn(ir::Graph* graph); + + // Feed LoDTensors to tun CINN compiled object and return fetched result + std::map Run( + const ir::Graph& graph, Scope* scope, + std::map* feed_targets); + + private: + CinnRunner() {} + + static std::once_flag get_instance_once_flag_; + static std::shared_ptr instance_; + std::unordered_map, + CinnCacheKey::Hash> + cache_; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc new file mode 100644 index 0000000000000..c02b994c147ca --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_runner_test.cc @@ -0,0 +1,44 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" + +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; + +TEST(CinnRunnerTest, TodoTest) { + ProgramDesc empty_program; + Graph empty_graph(empty_program); + Scope empty_scope; + std::map empty_feed; + + std::shared_ptr cinn_runner = CinnRunner::GetInstance(); + cinn_runner->ReplaceWithCinn(&empty_graph); + cinn_runner->Run(empty_graph, &empty_scope, &empty_feed); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index adbbfb380bc45..3b80e9c78677d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,13 +27,16 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_runner.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" @@ -41,8 +44,13 @@ limitations under the License. */ #include "paddle/fluid/platform/cuda_device_guard.h" #endif +DECLARE_bool(use_cinn); DECLARE_double(eager_delete_tensor_gb); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DECLARE_bool(sync_nccl_allreduce); +#endif + #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif @@ -669,6 +677,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // ncclOp std::vector async_graphs = CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); + PrepareForCUDAGraphCapture(graph); graph = member_->ApplyMemoryOptimizePass(graph); async_graphs[0] = graph; @@ -882,6 +891,23 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter ParallelExecutor Run"; +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + PADDLE_ENFORCE_EQ( + member_->build_strategy_.allow_cuda_graph_capture_, true, + platform::errors::InvalidArgument( + "You must turn on build_strategy.allow_cuda_graph_capture = True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + member_->places_[0], platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif + #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); @@ -919,6 +945,40 @@ void ParallelExecutor::RunWithoutFetch( member_->executor_->Run(/*fetch_tensors*/ {}, /*return_merged*/ false); } +FetchResultType ParallelExecutor::RunFromCinn( + const std::unordered_map &feed_tensors, + const std::vector &fetch_names) { + // Feed tensor to scope, now only support 1 scope + // TODO(zhhsplendid): handle multiple scope + size_t scope_id = 0; + std::map cinn_input_tensors; + for (auto &name_tensor_pair : feed_tensors) { + bool is_persistable = member_->IsPersistable(name_tensor_pair.first); + if (!is_persistable) { + member_->SetSkipMemoryReuse(scope_id, name_tensor_pair.first); + } + Scope *feed_scope = is_persistable ? 
member_->local_scopes_[scope_id] + : member_->local_exec_scopes_[scope_id]; + Variable *feed_var = feed_scope->Var(name_tensor_pair.first); + LoDTensor *trg = feed_var->GetMutable(); + trg->ShareDataWith(name_tensor_pair.second); + trg->set_lod(name_tensor_pair.second.lod()); + + cinn_input_tensors[name_tensor_pair.first] = trg; + } + + // TODO(zhhsplendid): get correct API after CINN API is ready + // now only return empty fetch result; + std::shared_ptr cinn_runner = + paddle2cinn::CinnRunner::GetInstance(); + + cinn_runner->Run(Graph(), member_->local_exec_scopes_[scope_id], + &cinn_input_tensors); + + paddle::framework::FetchResultType fetches = FetchList(fetch_names.size()); + return fetches; +} + void ParallelExecutor::SkipMemoryReuse( size_t scope_idx, const std::vector &skip_vars) { for (auto &var_name : skip_vars) { @@ -932,6 +992,16 @@ void ParallelExecutor::SkipMemoryReuse( void ParallelExecutor::FeedTensorsIntoLocalScopes( const std::vector> &tensors) { + if (platform::IsCUDAGraphCapturing()) { + for (auto &tensor : tensors) { + PADDLE_ENFORCE_EQ( + tensor.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + } + return; + } + if (!member_->AllowPartialFeed()) { PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(), platform::errors::Unimplemented( @@ -987,6 +1057,14 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ( + tensors.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + return; + } + size_t num_places = member_->places_.size(); bool allow_partial_feed = member_->AllowPartialFeed(); @@ -1568,6 +1646,107 @@ const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } +void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { + const auto &build_strategy = member_->build_strategy_; + if (!build_strategy.allow_cuda_graph_capture_) return; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_EQ( + build_strategy.async_mode_, false, + platform::errors::InvalidArgument( + "Async Executor does not support CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + platform::IsCUDAGraphCapturing(), false, + platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " + "when running the first batch.")); + PADDLE_ENFORCE_EQ( + member_->places_.size(), 1, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when one GPU device is running.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true, + platform::errors::InvalidArgument( + "CUDA Graph is only supported on NVIDIA GPU device.")); + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false, + platform::errors::InvalidArgument( + "FLAGS_sync_nccl_allreduce must be False to support " + "CUDA Graph capturing.")); + + std::unordered_map> all_vars; + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + auto *var_desc = node->Var(); + all_vars[var_desc->Name()].emplace_back(var_desc); + } + } + + auto mark_var_as_persistable = [&all_vars](const std::string &name) { + auto iter = all_vars.find(name); + if (iter != all_vars.end()) { + for (auto *var_desc : iter->second) { + var_desc->SetPersistable(true); + } + } + }; + + // Step 1: All fused vars must be persistable. 
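+  // (Persistable variables are skipped by the scope garbage collector, so
+  // marking them persistable keeps their device buffers at a stable address;
+  // CUDA Graph replay reuses the addresses recorded at capture time, which is
+  // what the persistability marking in the steps below relies on.)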
+ if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + fused_var.second.persistable_ = true; + mark_var_as_persistable(fused_var.first); + } + } + + // Step 2: All pinned vars must be persistable. + if (graph->Has(details::kPinnedVars)) { + auto &pinned_vars = graph->Get(details::kPinnedVars); + for (auto &pinned_var : pinned_vars) { + mark_var_as_persistable(pinned_var); + } + } + + // Step 3: Move all main programs to startup programs to make sure that + // the main programs would only be run once. + if (graph->Has(details::kProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + auto &main_programs = + graph->Get(details::kProgramDescs); + for (auto &main_program : main_programs) { + startup_programs.emplace_back(main_program); + } + graph->Erase(details::kProgramDescs); + } + + // Step 4: Mark all vars in startup programs to be persistable. + if (graph->Has(details::kStartupProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + for (auto &startup_program : startup_programs) { + for (auto &op_desc : startup_program.Block(0).AllOps()) { + for (auto &output : op_desc->OutputArgumentNames()) { + mark_var_as_persistable(output); + } + } + } + } + + // Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy. + auto ops = ir::FilterByNodeWrapper(*graph); + auto *scope = member_->local_scopes_[0]; + for (auto *op : ops) { + auto *loss_grad_op = dynamic_cast(op); + if (loss_grad_op == nullptr) continue; + auto loss_grad_name = loss_grad_op->LossGradName(); + mark_var_as_persistable(loss_grad_name); + loss_grad_op->RunOnVar(scope->Var(loss_grad_name)); + loss_grad_op->SetSkipRunning(true); + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 6c871a8d85815..f908ce3f01393 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include #include #include #include @@ -92,6 +93,10 @@ class ParallelExecutor { void RunWithoutFetch(const std::vector &skip_eager_vars); + FetchResultType RunFromCinn( + const std::unordered_map &feed_tensors, + const std::vector &fetch_names); + void ResetOpHandleScopeMapOfGraphs( const std::unordered_map &scope_map); @@ -144,6 +149,8 @@ class ParallelExecutor { void SetReaderOpDeviceInfoOfGraphs( const std::vector &final_graphs); + void PrepareForCUDAGraphCapture(ir::Graph *graph); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; std::vector var_infos_; diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 8b16b6a5d007f..dc7b86d344d77 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -29,9 +29,12 @@ namespace framework { void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { - dataset_ = dataset; + SetDataset(dataset); thread_num_ = trainer_desc.thread_num(); param_ = trainer_desc.downpour_param(); + ParseDumpConfig(trainer_desc); + mpi_rank_ = trainer_desc.mpi_rank(); + mpi_size_ = trainer_desc.mpi_size(); for (int i = 0; i < param_.dense_table_size(); ++i) { uint64_t table_id = static_cast(param_.dense_table(i).table_id()); auto table = param_.dense_table(i); @@ -44,6 +47,8 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, int place_num = trainer_desc.worker_places_size(); const std::vector readers = dataset->GetReaders(); + dump_file_num_ = trainer_desc.dump_file_num(); + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); std::vector dev_ids; for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); @@ -64,6 +69,11 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); workers_[i]->SetDeviceIndex(i); + workers_[i]->SetNeedDumpField(need_dump_field_); + workers_[i]->SetNeedDumpParam(need_dump_param_); + workers_[i]->SetDumpFieldVector(dump_fields_); + workers_[i]->SetDumpParamVector(dump_param_); + workers_[i]->InitRandomDumpConfig(trainer_desc); workers_[i]->SetDataFeed(readers[i]); workers_[i]->Initialize(trainer_desc); workers_[i]->SetWorkerNum(place_num); @@ -71,7 +81,14 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } -void PSGPUTrainer::DumpWork(int tid) {} +std::string PSGPUTrainer::GetDumpPath(int tid) { + if (user_define_dump_filename_ != "") { + return string::format_string("%s/part-%s-%05d", dump_fields_path_.c_str(), + user_define_dump_filename_.c_str(), tid); + } + return string::format_string("%s/part-%03d-%05d", dump_fields_path_.c_str(), + mpi_rank_, tid); +} void PSGPUTrainer::RegisterHeterCallback() { /* @@ -124,7 +141,28 @@ void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program, return; } +void PSGPUTrainer::InitDumpEnv() { + queue_ = paddle::framework::MakeChannel(); + for (size_t i = 0; i < places_.size(); ++i) { + workers_[i]->SetChannelWriter(queue_.get()); + } + dump_thread_num_ = 1; + if (dump_file_num_ > mpi_size_) { + dump_thread_num_ = dump_file_num_ / mpi_size_; + if (dump_file_num_ % mpi_size_ > mpi_rank_) { + dump_thread_num_ += 1; + } + } + for (int i = 0; i < dump_thread_num_; i++) { + dump_thread_.push_back( + std::thread(std::bind(&TrainerBase::DumpWork, this, i))); + } +} + void PSGPUTrainer::InitOtherEnv(const ProgramDesc& main_program) { + if (need_dump_field_ || need_dump_param_) { + 
InitDumpEnv(); + } VLOG(3) << "init other env done."; } @@ -204,6 +242,9 @@ void PSGPUTrainer::Finalize() { } } MergeDenseParam(); + if (need_dump_field_ || need_dump_param_) { + FinalizeDumpEnv(); + } root_scope_->DropKids(); } } // namespace framework diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 66d8a40dda160..e41768810c6d2 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -34,11 +34,6 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); mpi_rank_ = desc.mpi_rank(); trainer_desc_ = desc; - /* - for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) { - send_var_list_.push_back(trainer_desc_.xpu_recv_list(i)); - } - */ for (int i = 0; i < param_.sparse_table_size(); ++i) { uint64_t table_id = static_cast(param_.sparse_table(i).table_id()); @@ -89,19 +84,7 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { no_cvm_ = desc.no_cvm(); scale_datanorm_ = desc.scale_datanorm(); dump_slot_ = desc.dump_slot(); - dump_fields_.resize(desc.dump_fields_size()); - for (int i = 0; i < desc.dump_fields_size(); ++i) { - dump_fields_[i] = desc.dump_fields(i); - } adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); - need_dump_param_ = false; - dump_param_.resize(desc.dump_param_size()); - for (int i = 0; i < desc.dump_param_size(); ++i) { - dump_param_[i] = desc.dump_param(i); - } - if (desc.dump_param_size() != 0) { - need_dump_param_ = true; - } for (int i = 0; i < desc.check_nan_var_names_size(); ++i) { check_nan_var_names_.push_back(desc.check_nan_var_names(i)); } @@ -134,12 +117,6 @@ void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { writer_.Reset(queue); } -void PSGPUWorker::SetNeedDump(bool need_dump_field) { - need_dump_field_ = need_dump_field; -} - -void PSGPUWorker::DumpParam() {} - void PSGPUWorker::TrainFiles() { platform::SetNumThreads(1); platform::Timer timeline; @@ -150,6 +127,7 @@ void PSGPUWorker::TrainFiles() { // how to accumulate fetched values here device_reader_->Start(); int cur_batch; + int batch_cnt = 0; while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; for (auto& op : ops_) { @@ -164,9 +142,19 @@ void PSGPUWorker::TrainFiles() { op->Run(*thread_scope_, place_); } } + if (need_dump_field_) { + DumpField(*thread_scope_, dump_mode_, dump_interval_); + } + if (need_dump_param_ && thread_id_ == 0) { + DumpParam(*thread_scope_, batch_cnt); + } PrintFetchVars(); thread_scope_->DropKids(); + ++batch_cnt; + } + if (need_dump_field_ || need_dump_param_) { + writer_.Flush(); } timeline.Pause(); VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 4f6eb803d1c26..fbd7aa588d49a 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -29,14 +29,16 @@ void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet( "Tensor holds no memory. " "Call Tensor::mutable_data firstly.")); + size_t size = numel() * SizeOfType(type()); + PADDLE_ENFORCE_LE( - numel() * SizeOfType(type()), memory_size(), + size, memory_size(), platform::errors::PreconditionNotMet( "Tensor's dimension is out of bound." "Tensor's dimension must be equal or less than the size of its " "memory." 
"But received Tensor's dimension is d%, memory's size is %d.", - numel() * SizeOfType(type()), memory_size())); + size, memory_size())); } Tensor::Tensor(const proto::VarType::Type& dtype) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 15021b6267b65..ee30a82aff6ef 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -1065,6 +1065,9 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, if (type.code == kDLFloat) return static_cast( dst->mutable_data(dst_place)); + if (type.code == kDLBfloat) + return static_cast( + dst->mutable_data(dst_place)); PADDLE_THROW(platform::errors::Unimplemented( "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", type.code, type.bits)); @@ -1081,6 +1084,16 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast(dst->mutable_data(dst_place)); + if (type.code == kDLComplex) + return static_cast( + dst->mutable_data>(dst_place)); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); + case 128: + if (type.code == kDLComplex) + return static_cast( + dst->mutable_data>(dst_place)); PADDLE_THROW(platform::errors::Unimplemented( "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", type.code, type.bits)); @@ -1107,15 +1120,15 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { auto src_ptr = static_cast(dl_tensor.data); auto size = paddle::framework::product(vddim) * type.bits / 8; - if (dl_tensor.ctx.device_type == kDLCPU) { + if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (dl_tensor.ctx.device_type == kDLGPU) { + if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = - platform::CUDAPlace(dl_tensor.ctx.device_id); + platform::CUDAPlace(dl_tensor.device.device_id); platform::CUDAPlace src_place = - platform::CUDAPlace(dl_tensor.ctx.device_id); + platform::CUDAPlace(dl_tensor.device.device_id); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); memory::Copy( diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 0f34c84549f2b..f6e274e6257e4 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -258,13 +258,12 @@ class PSGPUTrainer : public TrainerBase { virtual void Run(); virtual void Finalize(); virtual void RegisterHeterCallback(); - virtual void DumpWork(int tid); virtual Scope* GetWorkerScope(int thread_id); virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual std::string GetDumpPath(int tid) { return ""; } - virtual void InitDumpEnv() {} + virtual std::string GetDumpPath(int tid); + virtual void InitDumpEnv() override; virtual void MergeDenseParam(); template @@ -286,6 +285,9 @@ class PSGPUTrainer : public TrainerBase { std::vector threads_; int use_ps_gpu_; int thread_num_; + int mpi_rank_; + int mpi_size_; + int dump_file_num_; }; #endif diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index d1a1757d5309b..a6f56ad445834 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -160,7 +160,7 
@@ class VarDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. - uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: const proto::VarType::TensorDesc &tensor_desc() const; diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index c1ec675a55707..45756083c9047 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -307,7 +307,15 @@ static void FillConstantLike(const VariableWrapper &ref_var, auto *dst_tensor = dst_var->MutableVar()->GetMutable(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); dst_tensor->Resize(ref_tensor.dims()); - dst_tensor->mutable_data(place, ref_var.DataType()); + // TOOD(jiabin): Ugly fix here we have fwd_data_type_ and data_type, since in + // grad mission + // we can't get data_type_ directly. We need to check if we can only use + // default data_type for now. + if (ref_var.ForwardDataType() != -1) { + dst_tensor->mutable_data(place, ref_var.ForwardDataType()); + } else { + dst_tensor->mutable_data(place, ref_var.DataType()); + } operators::math::set_constant(*dev_ctx, dst_tensor, value); } diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 5fa8b89a396d9..758e8e62718e7 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -162,6 +162,7 @@ class VariableWrapper { return tensor->type(); } else { VLOG(6) << "The tensor of variable " << name_ << " is not initialized"; + return data_type_; } } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 804f035a2e2ca..3136e53e74d09 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1410,6 +1410,7 @@ USE_TRT_CONVERTER(reduce_mean); USE_TRT_CONVERTER(tile); USE_TRT_CONVERTER(conv3d); USE_TRT_CONVERTER(conv3d_transpose); +USE_TRT_CONVERTER(mish); #endif namespace paddle_infer { diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index c79915629b70d..f2c7a4b62bbbb 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -17,6 +17,7 @@ nv_library(tensorrt_converter gather_nd_op.cc tile_op.cc conv3d_op.cc + mish_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/mish_op.cc b/paddle/fluid/inference/tensorrt/convert/mish_op.cc new file mode 100644 index 0000000000000..6b646d9935b52 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/mish_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Mish converter from fluid to tensorRT. + */ +class MishOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid Mish op to tensorrt Mish plugin"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + + const float threshold = + op_desc.HasAttr("threshold") + ? BOOST_GET_CONST(float, op_desc.GetAttr("threshold")) + : 20.0f; + + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPluginDynamic* plugin = + new plugin::MishPluginDynamic(threshold, with_fp16); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + } else { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPlugin* plugin = new plugin::MishPlugin(threshold, with_fp16); + layer = engine_->AddPlugin(&input, input_num, plugin); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "mish", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(mish, MishOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1898f28c73ad0..733a8f64ae5db 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -107,6 +107,9 @@ class Pool2dOpConverter : public OpConverter { plugin_pool_type = plugin::PoolPlugin::PoolType::avg; } + if (padding_algorithm == "VALID") { + std::fill(paddings.begin(), paddings.end(), 0); + } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); nvinfer1::DimsHW nv_strides(strides[0], strides[1]); nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); @@ -123,6 +126,30 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + layer = pool_layer; + } else if (!adaptive && !global_pooling && ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + // If ceil mode is true, we will pad the appropriate size to the input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, + input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER( + engine_, Padding, *const_cast(input1), pre_pad, + post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. 
The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc new file mode 100644 index 0000000000000..c84c30255fa96 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(mish_op, test_mish) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("mish-X", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("mish-Out", nvinfer1::Dims3(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mish"); + desc.SetInput("X", {"mish-X"}); + desc.SetOutput("Out", {"mish-Out"}); + + desc.SetAttr("threshold", 20.0f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(mish); diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc index 2d12eaf736b75..17d217dff43fd 100644 --- a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc @@ -48,13 +48,20 @@ class YoloBoxOpConverter : public OpConverter { float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh")); bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox")); float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y")); + bool iou_aware = op_desc.HasAttr("iou_aware") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("iou_aware")) + : false; + float iou_aware_factor = + op_desc.HasAttr("iou_aware_factor") + ? BOOST_GET_CONST(float, op_desc.GetAttr("iou_aware_factor")) + : 0.5; int type_id = static_cast(engine_->WithFp16()); auto input_dim = X_tensor->getDimensions(); auto* yolo_box_plugin = new plugin::YoloBoxPlugin( type_id ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, - input_dim.d[1], input_dim.d[2]); + iou_aware, iou_aware_factor, input_dim.d[1], input_dim.d[2]); std::vector yolo_box_inputs; yolo_box_inputs.push_back(X_tensor); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 5bfd2f1277795..ef50aee48e2eb 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -48,9 +48,11 @@ struct SimpleOpTypeSetTeller : public Teller { int8_teller_set.insert("skip_layernorm"); int8_teller_set.insert("slice"); #endif -#if IS_TRT_VERSION_GE(7130) - teller_set.insert("group_norm"); -#endif +// TODO(baoachun) The group_norm trt plugin will check input's dim +// not -1 failed when dynamic shape mode. +// #if IS_TRT_VERSION_GE(7130) +// teller_set.insert("group_norm"); +// #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); #endif @@ -134,7 +136,8 @@ struct SimpleOpTypeSetTeller : public Teller { "reduce_sum", "reduce_mean", "conv3d", - "conv3d_transpose"}; + "conv3d_transpose", + "mish"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -171,22 +174,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "pool2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (paddings.size() > 2) return false; - if (desc.HasAttr("exclusive")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { - std::vector ksize = - BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); - for (size_t i = 0; i < ksize.size(); i++) { - if (ksize[i] <= paddings[i]) { - VLOG(3) << "the padding size should be less than the filter size " - "for exclusive-counting pooling."; - return false; - } - } - } - } - if (desc.HasAttr("ceil_mode")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("ceil_mode"))) return false; + if (paddings.size() > 2) { + return false; } if (desc.Input("X").size() != 1) { VLOG(3) << "TRT Pool2d expect 1 input, but got " @@ -208,15 +197,32 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << pool_type << " pool type."; return false; } + if (pool_type == "avg") { + if (desc.HasAttr("global_pooling")) { + if (!BOOST_GET_CONST(bool, desc.GetAttr("global_pooling"))) { + if (desc.HasAttr("exclusive")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { + std::vector ksize = + BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); + for (size_t i = 0; i < ksize.size(); i++) { + if (ksize[i] <= paddings[i]) { + VLOG(3) << "the padding size should be less than the " + "filter size " + "for exclusive-counting pooling."; + return false; + } + } + } + } + } + } + } } } if (op_type == "conv2d" || op_type == "conv2d_transpose" || op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || op_type == "depthwise_conv2d_transpose") { - std::vector paddings = - BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (desc.Input("Input").size() != 1) { VLOG(3) << "TRT Conv2d expect 1 input, but got " << desc.Input("Input").size() << " input."; @@ -1046,6 +1052,44 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "mish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of mish TRT converter. 
" + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of mish TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) { + VLOG(3) << "mish op does not support input's dim is 1 in tensorrt."; + return false; + } + + if (!with_dynamic_shape) { + if (x_shape.size() == 2) { + VLOG(3) << "mish op does not support input's dim is 2 in tensorrt."; + return false; + } + } + } + if (op_type == "roi_align") { if (!with_dynamic_shape) { VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 311c2312a9f45..e6bcb59fd092c 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -9,6 +9,7 @@ nv_library(tensorrt_plugin yolo_box_op_plugin.cu roi_align_op_plugin.cu gather_nd_op_plugin.cu + mish_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 69e0075729b0d..d6a1cdb9e68a6 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -65,12 +65,6 @@ nvinfer1::Dims ElementWisePlugin::getOutputDimensions( } int ElementWisePlugin::initialize() TRT_NOEXCEPT { - PADDLE_ENFORCE_GT(dims_y_.nbDims, 0, - platform::errors::InvalidArgument( - "The dimension of input Y of TRT elementwise op plugin " - "should be greater than 0, but got %d.", - dims_y_.nbDims)); - axis_ = (axis_ == -1) ? dims_x_.nbDims - dims_y_.nbDims : axis_; int trimed_nb_dims = dims_y_.nbDims; for (; trimed_nb_dims > 0; --trimed_nb_dims) { diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu new file mode 100644 index 0000000000000..6e268e7b0b330 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include "glog/logging.h" +#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +int MishPlugin::initialize() TRT_NOEXCEPT { return 0; } + +bool MishPlugin::supportsFormat( + nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT { + if (with_fp16_) { + return ((type == nvinfer1::DataType::kFLOAT || + type == nvinfer1::DataType::kHALF) && + (format == nvinfer1::PluginFormat::kLINEAR)); + } else { + return ((type == nvinfer1::DataType::kFLOAT) && + (format == nvinfer1::PluginFormat::kLINEAR)); + } +} + +nvinfer1::Dims MishPlugin::getOutputDimensions(int index, + const nvinfer1::Dims* in_dims, + int nb_inputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument( + "We expect [number of inputs] == 1" + "in TRT Mish op plugin, but got " + "[number of inputs] = %d.", + nb_inputs)); + PADDLE_ENFORCE_LT(index, this->getNbOutputs(), + platform::errors::InvalidArgument( + "We expect [index] < [number of outputs]" + "in TRT Mish op plugin, but got " + "[index] = %d, [number of outputs] = %d.", + index, this->getNbOutputs())); + nvinfer1::Dims const& input_dims = in_dims[0]; + nvinfer1::Dims output_dims = input_dims; + return output_dims; +} + +template +__device__ T kTanh(T x) { + return tanh(x); +} + +template <> +__device__ half kTanh(half x) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const float tmp = tanhf(__half2float(x)); + return __float2half(tmp); +#endif +} + +template +__device__ T kSoftplus(T x, T threshold) { + return x > threshold ? x : log(exp(x) + static_cast(1.0f)); +} + +template <> +__device__ half kSoftplus(half x, half threshold) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + return x > threshold ? x : hlog(hexp(x) + static_cast(1.0f)); +#endif +} + +template +__global__ void mish_kernel(float threshold, int n, const T* input, T* output) { + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + const T in = input[idx]; + output[idx] = in * kTanh(kSoftplus(in, static_cast(threshold))); + } +} + +template <> +__global__ void mish_kernel(float threshold, int n, const half* input, + half* output) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + const half in = input[idx]; + output[idx] = + in * kTanh(kSoftplus(in, static_cast(threshold))); + } +#endif +} + +#if IS_TRT_VERSION_LT(8000) +int MishPlugin::enqueue(int batchSize, const void* const* inputs, + void** outputs, +#else +int MishPlugin::enqueue(int batchSize, const void* const* inputs, + void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT { + const auto& input_dims = this->getInputDims(0); + int num = batchSize; + for (int i = 0; i < input_dims.nbDims; i++) { + num *= input_dims.d[i]; + } + + const int block_size = 256; + const int grid_size = (num + block_size - 1) / block_size; + + auto type = getDataType(); + if (type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; + const float* input = static_cast(inputs[0]); + float* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else if (type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. 
Mish-->fp16"; + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Mish TRT Plugin's input type should be float or half.")); + } + + return cudaGetLastError() != cudaSuccess; +} + +// Dynamic Plugin below. +int MishPluginDynamic::initialize() TRT_NOEXCEPT { + getPluginNamespace(); + return 0; +} + +size_t MishPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { + return SerializedSize(threshold_) + SerializedSize(with_fp16_); +} + +void MishPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { + SerializeValue(&buffer, threshold_); + SerializeValue(&buffer, with_fp16_); +} + +nvinfer1::DimsExprs MishPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + return inputs[0]; +} + +bool MishPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of mish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType MishPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The Mish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +int MishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_dims = input_desc[0].dims; + size_t num = ProductDim(input_dims); + const int block_size = 256; + const int grid_size = (num + block_size - 1) / block_size; + + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; + const float* input = static_cast(inputs[0]); + float* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. 
Mish-->fp16"; + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Mish TRT Plugin's input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h new file mode 100644 index 0000000000000..75390666ea097 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -0,0 +1,175 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class MishPlugin : public PluginTensorRT { + private: + float threshold_; + + protected: + size_t getSerializationSize() const TRT_NOEXCEPT override { + return getBaseSerializationSize() + SerializedSize(threshold_); + } + + // TRT will call this func to serialize the configuration of TRT + // It should not be called by users. + void serialize(void* buffer) const TRT_NOEXCEPT override { + serializeBase(buffer); + SerializeValue(&buffer, threshold_); + } + + public: + explicit MishPlugin(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ MishPlugin(void const* serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &threshold_); + } + + ~MishPlugin() {} + MishPlugin* clone() const TRT_NOEXCEPT override { + return new MishPlugin(threshold_, with_fp16_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT override; +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; +}; + +class MishPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new MishPlugin(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginCreator); + +class MishPluginDynamic : public DynamicPluginTensorRT { + public: + explicit MishPluginDynamic(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + MishPluginDynamic(void const* serialData, size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &threshold_); + DeserializeValue(&serialData, &serialLength, &with_fp16_); + } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + return new MishPluginDynamic(threshold_, with_fp16_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + float threshold_; +}; + +class MishPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* 
getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + auto plugin = new MishPluginDynamic(serial_data, serial_length); + return plugin; + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index ee1709f57e259..57177cfa8b421 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include -#include #include #include @@ -29,7 +27,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, - const float scale_x_y, const int input_h, + const float scale_x_y, const bool iou_aware, + const float iou_aware_factor, const int input_h, const int input_w) : data_type_(data_type), class_num_(class_num), @@ -37,6 +36,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, downsample_ratio_(downsample_ratio), clip_bbox_(clip_bbox), scale_x_y_(scale_x_y), + iou_aware_(iou_aware), + iou_aware_factor_(iou_aware_factor), input_h_(input_h), input_w_(input_w) { anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); @@ -45,6 +46,7 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, assert(class_num_ > 0); assert(input_h_ > 0); assert(input_w_ > 0); + assert((iou_aware_factor_ > 0 && iou_aware_factor_ < 1)); cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), @@ -59,6 +61,8 @@ YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { DeserializeValue(&data, &length, &downsample_ratio_); DeserializeValue(&data, &length, &clip_bbox_); DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &iou_aware_); + DeserializeValue(&data, &length, &iou_aware_factor_); DeserializeValue(&data, &length, &input_h_); DeserializeValue(&data, &length, &input_w_); } @@ -119,10 +123,10 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, int img_height, int img_width, float scale, float bias) { box[0] = static_cast( - (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + (i + sigmoid(static_cast(x[index])) * scale + bias) * img_width / grid_size_w); box[1] = static_cast( - (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + (j + sigmoid(static_cast(x[index + stride])) * scale + bias) * img_height / grid_size_h); box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * anchors[2 * an_idx] * img_width / input_size_w); @@ -133,8 +137,19 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, __device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + int entry, bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * 
an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +__device__ inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride) { + return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; } template @@ -178,7 +193,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, const int w, const int an_num, const int class_num, const int box_num, int input_size_h, int input_size_w, bool clip_bbox, const float scale, - const float bias) { + const float bias, bool iou_aware, + const float iou_aware_factor) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; float box[4]; @@ -193,11 +209,16 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; - int obj_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, + iou_aware); float conf = sigmoid(static_cast(input[obj_idx])); - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + if (iou_aware) { + int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + float iou = sigmoid(input[iou_idx]); + conf = powf(conf, 1. - iou_aware_factor) * powf(iou, iou_aware_factor); + } + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, + iou_aware); if (conf < conf_thresh) { for (int i = 0; i < 4; ++i) { @@ -212,8 +233,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, + 5, iou_aware); int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); @@ -240,7 +261,8 @@ int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, reinterpret_cast(inputs[1]), reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, - input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias, iou_aware_, + iou_aware_factor_); return cudaGetLastError() != cudaSuccess; } @@ -274,6 +296,8 @@ size_t YoloBoxPlugin::getSerializationSize() const TRT_NOEXCEPT { serialize_size += SerializedSize(scale_x_y_); serialize_size += SerializedSize(input_h_); serialize_size += SerializedSize(input_w_); + serialize_size += SerializedSize(iou_aware_); + serialize_size += SerializedSize(iou_aware_factor_); return serialize_size; } @@ -285,6 +309,8 @@ void YoloBoxPlugin::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, downsample_ratio_); SerializeValue(&buffer, clip_bbox_); SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, iou_aware_); + SerializeValue(&buffer, iou_aware_factor_); SerializeValue(&buffer, input_h_); SerializeValue(&buffer, input_w_); } @@ -326,8 +352,8 @@ void YoloBoxPlugin::configurePlugin( nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const TRT_NOEXCEPT { return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, - 
downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, - input_w_); + downsample_ratio_, clip_bbox_, scale_x_y_, + iou_aware_, iou_aware_factor_, input_h_, input_w_); } YoloBoxPluginCreator::YoloBoxPluginCreator() {} @@ -367,6 +393,8 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( float scale_x_y = 1.; int h = -1; int w = -1; + bool iou_aware = false; + float iou_aware_factor = 0.5; for (int i = 0; i < fc->nbFields; ++i) { const std::string field_name(fc->fields[i].name); @@ -386,6 +414,10 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( clip_bbox = *static_cast(fc->fields[i].data); } else if (field_name.compare("scale_x_y")) { scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware")) { + iou_aware = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware_factor")) { + iou_aware_factor = *static_cast(fc->fields[i].data); } else if (field_name.compare("h")) { h = *static_cast(fc->fields[i].data); } else if (field_name.compare("w")) { @@ -397,7 +429,8 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( return new YoloBoxPlugin( type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, - class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, iou_aware, + iou_aware_factor, h, w); } nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h index c9e9f9a0567ae..ae9a6739cedd3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -31,6 +31,7 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, const float scale_x_y, + const bool iou_aware, const float iou_aware_factor, const int input_h, const int input_w); YoloBoxPlugin(const void* data, size_t length); ~YoloBoxPlugin() override; @@ -89,6 +90,8 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { float scale_x_y_; int input_h_; int input_w_; + bool iou_aware_; + float iou_aware_factor_; std::string namespace_; }; diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6b4afae9f8c75..4aa1900f53f5e 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -82,7 +82,11 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) + +if (WITH_GPU) + target_link_libraries(allocator_facade cuda_graph) +endif() cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 1d89918bfebf6..f0b7f1a4b0d9e 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -20,6 +20,7 @@ namespace paddle { 
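For reference, when iou_aware is enabled the yolo_box kernel above blends the raw objectness with the predicted IoU as a weighted geometric mean, conf = conf^(1 - factor) * iou^factor. A host-side sketch of that scoring rule (the function name is illustrative):

#include <cmath>

// Reference for the IoU-aware confidence used in KeYoloBoxFw: factor = 0
// reduces to the plain objectness score, factor = 1 trusts the IoU head only.
inline float IoUAwareConfidence(float obj_conf, float pred_iou, float factor) {
  return std::pow(obj_conf, 1.0f - factor) * std::pow(pred_iou, factor);
}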
namespace memory { namespace allocation { +// For memory address alignment class AlignedAllocation : public Allocation { public: AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 78bce53b6f4ff..281902f3a2b12 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -23,6 +23,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -32,6 +33,9 @@ #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph.h" +#endif #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif @@ -47,17 +51,64 @@ PADDLE_DEFINE_EXPORTED_bool( "Whether to use system allocator to allocate CPU and GPU memory. " "Only used for unittests."); +DECLARE_string(allocator_strategy); + namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_CUDA +class CUDAGraphAllocator + : public Allocator, + public std::enable_shared_from_this { + private: + class PrivateAllocation : public Allocation { + public: + PrivateAllocation(CUDAGraphAllocator* allocator, + AllocationPtr underlying_allocation) + : Allocation(underlying_allocation->ptr(), + underlying_allocation->size(), + underlying_allocation->place()), + allocator_(allocator->shared_from_this()), + underlying_allocation_(std::move(underlying_allocation)) {} + + private: + std::shared_ptr allocator_; + AllocationPtr underlying_allocation_; + }; + + explicit CUDAGraphAllocator(const std::shared_ptr& allocator) + : underlying_allocator_(allocator) {} + + public: + static std::shared_ptr Create( + const std::shared_ptr& allocator) { + return std::shared_ptr(new CUDAGraphAllocator(allocator)); + } + + protected: + Allocation* AllocateImpl(size_t size) { + VLOG(10) << "Allocate " << size << " for CUDA Graph"; + return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + } + + void FreeImpl(Allocation* allocation) { + VLOG(10) << "delete for CUDA Graph"; + delete allocation; + } + + private: + std::shared_ptr underlying_allocator_; +}; +#endif + class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; - AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { + strategy_ = GetAllocatorStrategy(); + switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU @@ -91,7 +142,8 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id)); + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk); } InitNaiveBestFitCUDAPinnedAllocator(); #endif @@ -117,7 +169,7 @@ class AllocatorFacadePrivate { default: { PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy))); + "Unsupported allocator strategy: %d", static_cast(strategy_))); } } 
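CUDAGraphAllocator above is a thin decorator: every PrivateAllocation pins both the wrapping allocator (via shared_from_this) and the underlying allocation, so memory handed out while a graph is being captured cannot be released before the graph's pool is torn down. A stripped-down, self-contained sketch of that ownership pattern; GraphScopedPool, Block, and Handle are hypothetical stand-ins, not Paddle types:

#include <cstddef>
#include <memory>

// Each handle pins both the block and the pool that produced it, so the pool
// outlives every allocation made through it.
struct Block {
  explicit Block(std::size_t n) : data(new char[n]) {}
  std::unique_ptr<char[]> data;
};

class GraphScopedPool : public std::enable_shared_from_this<GraphScopedPool> {
 public:
  struct Handle {
    std::shared_ptr<GraphScopedPool> pool;  // keeps the pool alive
    std::unique_ptr<Block> block;           // keeps the memory alive
  };

  static std::shared_ptr<GraphScopedPool> Create() {
    return std::shared_ptr<GraphScopedPool>(new GraphScopedPool());
  }

  Handle Allocate(std::size_t n) {
    return Handle{shared_from_this(), std::unique_ptr<Block>(new Block(n))};
  }

 private:
  GraphScopedPool() = default;
};

int main() {
  auto pool = GraphScopedPool::Create();
  auto handle = pool->Allocate(256);
  pool.reset();  // the handle still holds a reference, so the pool survives
  return handle.block->data ? 0 : 1;
}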
InitZeroSizeAllocators(); @@ -130,11 +182,31 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } + inline const AllocatorMap& GetAllocatorMap() { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_allocator_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + return iter->second->allocators_; + } else { + return allocators_; + } +#else + return allocators_; +#endif + } + inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { + VLOG(4) << "GetAllocator" + << " " << place << " " << size; const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? system_allocators_ - : allocators_) + : GetAllocatorMap()) : zero_size_allocators_); auto iter = allocators.find(place); PADDLE_ENFORCE_NE(iter, allocators.end(), @@ -145,6 +217,7 @@ class AllocatorFacadePrivate { private: void InitSystemAllocators() { + if (!system_allocators_.empty()) return; system_allocators_[platform::CPUPlace()] = std::make_shared(); #ifdef PADDLE_WITH_XPU int device_count = platform::GetXPUDeviceCount(); @@ -183,10 +256,42 @@ class AllocatorFacadePrivate { allocators_[p] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, + bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) 
{ + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize()); + underlying_allocator, alignment, 0, allow_free_idle_chunk); } #endif @@ -226,6 +331,7 @@ class AllocatorFacadePrivate { }; void InitZeroSizeAllocators() { + if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -279,12 +385,57 @@ class AllocatorFacadePrivate { } } +#ifdef PADDLE_WITH_CUDA + + public: + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); + } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } + + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; + } +#endif + private: AllocatorMap allocators_; - AllocatorMap zero_size_allocators_; - AllocatorMap system_allocators_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_allocator_map_; +#endif + AllocatorStrategy strategy_; + + static AllocatorMap zero_size_allocators_; + static AllocatorMap system_allocators_; }; +AllocatorFacadePrivate::AllocatorMap + AllocatorFacadePrivate::zero_size_allocators_; +AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; + // Pimpl. Make interface clean. 
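The two CUDA Graph hooks above are meant to bracket a capture: a graph-private memory pool is installed before recording starts and removed once the captured graph is released. A usage sketch, assuming the usual singleton accessor AllocatorFacade::Instance() and the CUDAGraphID alias pulled in through platform/gpu_info.h; the capture, replay, and wrapper function itself are illustrative:

#include "paddle/fluid/memory/allocation/allocator_facade.h"

// Illustrative lifecycle only: capture begin/end and graph replay are omitted;
// `id` is whatever identifier the capture machinery hands out.
void RunWithGraphPrivatePool(paddle::platform::CUDAGraphID id) {
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  facade.PrepareMemoryPoolForCUDAGraph(id);
  // ... begin CUDA Graph capture, record the ops, end capture, replay ...
  facade.RemoveMemoryPoolOfCUDAGraph(id);
}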
AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} // delete m_ may cause core dump when the destructor of python in conflict with @@ -316,6 +467,16 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +#ifdef PADDLE_WITH_CUDA +void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + return m_->PrepareMemoryPoolForCUDAGraph(id); +} + +void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + return m_->RemoveMemoryPoolOfCUDAGraph(id); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 7f6ad561aa931..8d889ec38eed7 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -18,6 +18,9 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { @@ -54,6 +57,11 @@ class AllocatorFacade { uint64_t Release(const platform::Place& place); const std::shared_ptr& GetAllocator(const platform::Place& place); +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); +#endif + // TODO(yy): Allocate a Copy-On-Write allocation? private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index a35d8a73f7eda..9f34f5198a179 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -39,14 +39,15 @@ namespace allocation { AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size) - : underlying_allocator_( - std::make_shared(underlying_allocator, alignment)), + size_t chunk_size, bool allow_free_idle_chunk) + : underlying_allocator_(underlying_allocator), alignment_(alignment), - chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {} + chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), + allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { - size = AlignedSize(size, alignment_); +Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { + size_t size = AlignedSize(unaligned_size, alignment_); + VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; std::lock_guard guard(spinlock_); auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); @@ -56,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { free_blocks_.erase(iter); auto *chunk = block_it->chunk_; size_t remaining_size = block_it->size_ - size; + VLOG(10) << "Allocate " << size << " bytes from chunk size " + << block_it->size_ << ", remaining " << remaining_size; if (remaining_size == 0) { block_it->is_free_ = false; } else { @@ -94,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { } blocks.emplace_back(p + remaining_size, size, false, chunk); block_it = --(blocks.end()); - VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining " - << remaining_size; + 
VLOG(2) << "Not found and reallocate " << realloc_size << "(" + << static_cast(p) << "), and remaining " << remaining_size; } return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { + VLOG(10) << "Free " << allocation->size() << " bytes"; std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; @@ -139,6 +143,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { } uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { + if (!allow_free_idle_chunk_) { + return 0; + } uint64_t bytes = 0; for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) { auto &blocks = chunk_it->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 5ed6eb94f158f..d1fa6cce0164f 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator { public: AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size = 0); + size_t chunk_size = 0, bool allow_free_idle_chunk = true); bool IsAllocThreadSafe() const override { return true; } @@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator { std::list chunks_; size_t alignment_; size_t chunk_size_; + bool allow_free_idle_chunk_; SpinLock spinlock_; }; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 6f2591c8b15c8..926af8292d2e8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" - #include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); @@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk, FLAGS_free_idle_chunk = free_idle_chunk; FLAGS_free_when_no_cache_hit = free_when_no_cache_hit; auto recorded_allocator = std::make_shared(); + size_t alignment = 4096; size_t memory_size = 8192; + auto underlying_allocator = + std::make_shared(recorded_allocator, alignment); auto ag_allocator = std::make_shared( - recorded_allocator, alignment); + underlying_allocator, alignment); for (size_t i = 0; i < 10; ++i) { auto allocation = ag_allocator->Allocate(memory_size); @@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) { auto underlying_allocator = std::make_shared(memory_capacity); + auto aligned_allocator = + std::make_shared(underlying_allocator, alignment); auto ag_allocator = std::make_shared( - underlying_allocator, alignment); + aligned_allocator, alignment); ag_allocator->Allocate(allocate_size[0]); ASSERT_EQ(underlying_allocator->AllocatedSize(), diff --git a/paddle/fluid/memory/allocation/spin_lock.h b/paddle/fluid/memory/allocation/spin_lock.h index 42462fd74b4cd..2bbe340e7c691 100644 --- a/paddle/fluid/memory/allocation/spin_lock.h +++ b/paddle/fluid/memory/allocation/spin_lock.h @@ -15,37 +15,48 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 +#if defined(_M_X64) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(__i386__) +#define __PADDLE_x86__ +#include +#endif +#include #include "paddle/fluid/platform/macros.h" namespace paddle { namespace memory { +static inline void CpuRelax() { +#if defined(__PADDLE_x86__) + _mm_pause(); +#endif +} class SpinLock { public: SpinLock() : mlock_(false) {} void lock() { - bool expect = false; - uint64_t spin_cnt = 0; - while (!mlock_.compare_exchange_weak(expect, true)) { - expect = false; - if ((++spin_cnt & 0xFF) == 0) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif + for (;;) { + if (!mlock_.exchange(true, std::memory_order_acquire)) { + break; + } + constexpr int kMaxLoop = 32; + for (int loop = 1; mlock_.load(std::memory_order_relaxed);) { + if (loop <= kMaxLoop) { + for (int i = 1; i <= loop; ++i) { + CpuRelax(); + } + loop *= 2; + } else { + std::this_thread::yield(); + } } } } - void unlock() { mlock_.store(false); } + void unlock() { mlock_.store(false, std::memory_order_release); } + DISABLE_COPY_AND_ASSIGN(SpinLock); private: diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 0d7d0a5e13bf3..b910b4ec73901 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -78,7 +78,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op sparse_attention_op lstm_op run_program_op eye_op recurrent_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) @@ -94,6 +94,10 @@ if (WITH_GPU OR WITH_ROCM) endif() op_library(sync_batch_norm_op) file(APPEND ${pybind_file} 
"USE_CUDA_ONLY_OP(sync_batch_norm);\n") + if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) ) + op_library(sparse_attention_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n") + endif() else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h index b19ba1e1590fe..2c34d6f8300a7 100644 --- a/paddle/fluid/operators/arg_min_max_op_base.cu.h +++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h @@ -89,22 +89,25 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input, const int64_t n) { auto cu_stream = ctx.stream(); auto ComputeBlockSize = [](int64_t col) { + auto block_size = 8; if (col > 512) - return 1024; + block_size = 1024; else if (col > 256) - return 512; + block_size = 512; else if (col > 128) - return 256; + block_size = 256; else if (col > 64) - return 128; + block_size = 128; else if (col > 32) - return 64; + block_size = 64; else if (col > 16) - return 32; + block_size = 32; else if (col > 8) - return 16; - else - return 8; + block_size = 16; +#ifdef __HIPCC__ + block_size = std::min(block_size, 256); +#endif + return block_size; }; int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x; diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index dfb620a4e96bd..791c3656791da 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -38,11 +38,13 @@ class NPUBatchNormOpKernel : public framework::OpKernel { const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ( + (x_dims.size() == 4UL || x_dims.size() == 3UL), true, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 3 or 4. 
" + " But got X's shape = [%s], X's dimension = [%d].", + x_dims.to_str(), x_dims.size())); + const auto *running_mean = ctx.Input("Mean"); const auto *running_var = ctx.Input("Variance"); const auto *scale = ctx.Input("Scale"); @@ -51,8 +53,11 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - Tensor x_tensor(x->type()); - Tensor y_tesnor(y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto y_tesnor = + ctx.AllocateTmpTensor(y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); y_tesnor.ShareDataWith(*y); if (data_layout == DataLayout::kNHWC) { @@ -89,6 +94,18 @@ class NPUBatchNormOpKernel : public framework::OpKernel { sum.mutable_data(running_mean->dims(), ctx.GetPlace()); square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); + // BNTrainingReduce ONLY support rank = 4 + if (x->dims().size() == 3) { + auto x_shape_vec = framework::vectorize(x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + x_tensor.Resize(x_new_shape); + x_tensor.Resize(x_new_shape); + } const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", {x_tensor}, {sum, square_sum}, {{"epsilon", epsilon}}); @@ -127,8 +144,11 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { use_global_stats = is_test || use_global_stats; - Tensor x_tensor(x->type()); - Tensor dy_tensor(d_y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto dy_tensor = + ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); dy_tensor.ShareDataWith(*d_y); if (data_layout == DataLayout::kNHWC) { @@ -136,14 +156,14 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { dy_tensor.set_layout(DataLayout::kNHWC); } - Tensor scale_grad_tmp(scale->type()); - Tensor bias_grad_tmp(bias->type()); + auto scale_grad_tmp = + ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto bias_grad_tmp = + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_scale == nullptr) { - scale_grad_tmp.Resize(scale->dims()); d_scale = &scale_grad_tmp; } if (d_bias == nullptr) { - bias_grad_tmp.Resize(bias->dims()); d_bias = &bias_grad_tmp; } @@ -169,9 +189,23 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { } if (d_x) { d_x->mutable_data(ctx.GetPlace()); - Tensor dx_tensor(d_x->type()); + auto dx_tensor = + ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); dx_tensor.ShareDataWith(*d_x); if (use_global_stats) { + if (x->dims().size() == 3) { + // BNInferGrad only support x rank = 4, + auto x_shape_vec = framework::vectorize(d_x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, + 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + dx_tensor.Resize(x_new_shape); + dy_tensor.Resize(x_new_shape); + } const auto *running_var = ctx.Input("Variance"); const auto &runner_infer = NpuOpRunner("BNInferGrad", {dy_tensor, *scale, *running_var}, diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 06300817e0a12..05a110fe65b83 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -47,12 +47,12 @@ __global__ void 
CastCUDAKernel(const InT* in, const int64_t N, OutT* out) { } template -struct CastOpFunctor { +struct CastCUDAOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; const platform::CUDADeviceContext& ctx_; - CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::CUDADeviceContext& ctx) + CastCUDAOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const platform::CUDADeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -75,41 +75,38 @@ struct CastOpFunctor { } }; +template +class CastCUDAOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast( + context.Attr("out_dtype")), + CastCUDAOpFunctor( + in, out, + context.template device_context())); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; + +#define REGISTER_CAST_CUDA_BASE(op_name, ...) \ + REGISTER_OP_CUDA_KERNEL( \ + op_name, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel>, \ + ops::CastCUDAOpKernel>, ##__VA_ARGS__); -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel>, - ops::CastOpKernel>); +#if !defined(PADDLE_WITH_HIP) +REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel) #else -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel>, - ops::CastOpKernel>); +REGISTER_CAST_CUDA_BASE(cast) #endif diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a400d27b798e3..e6b1f6a1c18c3 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -169,9 +169,21 @@ class ConcatOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); + auto input_data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + // extra checking if attr "use_mkldnn" exist is needed because + // test_reverse_op is calling concat_grad kernel without setting + // "use_mkldnn" to any value + if (ctx.HasAttr("use_mkldnn") && + this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4c0ef02074e2e..f4183bf570926 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/cudnn_desc.h" namespace paddle { namespace operators { @@ -480,6 +481,7 @@ struct SearchAlgorithm { static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, const framework::ExecutionContext& ctx) { + platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -601,6 +603,7 @@ struct SearchAlgorithm { } static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 86724e06975ed..47de843d1ac6f 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -186,11 +186,6 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { dilations[3] = dilation[1]; } - // LOG(INFO) << "strides = " << framework::make_ddim(strides).to_str(); - // LOG(INFO) << "dilations = " << framework::make_ddim(dilations).to_str(); - // LOG(INFO) << "padding = " << framework::make_ddim(padding).to_str(); - // LOG(INFO) << "data_format = " << data_format; - if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c04d04f841388..4e951f6318cc9 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -15,8 +15,13 @@ function(detection_library TARGET_NAME) PARENT_SCOPE) endfunction() +if (WITH_ASCEND_CL) + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) +else() + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) +endif() + detection_library(bipartite_match_op SRCS bipartite_match_op.cc) -detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc new file mode 100644 index 0000000000000..9d97c7af9630c --- /dev/null +++ b/paddle/fluid/operators/detection/box_coder_op_npu.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/detection/box_coder_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct BoxCoderFunction { + public: + explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + Tensor Adds(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Muls(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Mul(const Tensor& x, const Tensor& y) { + Tensor z; + z.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + Tensor SubWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + z.mutable_data(shape, place); + const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + void DivWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor DivWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + DivWithBroadCastVoid(x, y, shape, &z); + return z; + } + void MulWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor MulWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + MulWithBroadCastVoid(x, y, shape, &z); + return z; + } + void AddWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor AddWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + AddWithBroadCastVoid(x, y, shape, &z); + return z; + } + Tensor Abs(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Log(const Tensor& x) { + Tensor t_x_m1 = Adds(x, -1); + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Exp(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Dot(const Tensor& x, const Tensor& y) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + PADDLE_ENFORCE_EQ( + dim_x.size(), 2, + platform::errors::InvalidArgument( + "x should be a 2-dim tensor, but got %d-dim.", dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_y.size(), 2, + platform::errors::InvalidArgument( + "y should be a 2-dim tensor, but got %d-dim.", dim_y.size())); + PADDLE_ENFORCE_EQ( + dim_x[1], dim_y[0], + platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but " + "got dim_x[1] = %d, dim_y[0] = %d.", + dim_x[1], dim_y[0])); + 
Tensor z; + z.mutable_data({dim_x[0], dim_y[1]}, place); + const auto& runner = + NpuOpRunner("MatMul", {x, y}, {z}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + runner.Run(stream); + return z; + } + void ConcatVoid(const std::vector& inputs, + const framework::DDim& shape_out, int axis, Tensor* output) { + output->mutable_data(shape_out, place); + std::vector names; + for (size_t i = 0; i < inputs.size(); i++) { + names.push_back("x" + std::to_string(i)); + } + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*output}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; + runner.AddInputNames(names); + runner.Run(stream); + } + Tensor Concat(const std::vector& inputs, + const framework::DDim& shape_out, int axis) { + Tensor output; + ConcatVoid(inputs, shape_out, axis, &output); + return output; + } + Tensor Slice(const Tensor& x, const std::vector& offsets, + const std::vector& size, const framework::DDim& shape) { + Tensor y; + y.mutable_data(shape, place); + const auto& runner = + NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); + runner.Run(stream); + return y; + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +void Vector2Tensor(const framework::ExecutionContext& ctx, + const std::vector& vec, const framework::DDim& ddim, + Tensor* tsr) { + framework::TensorFromVector(vec, ctx.device_context(), tsr); + ctx.template device_context().Wait(); + tsr->Resize(ddim); +} + +template +void BoxCoderEnc(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, Tensor* out) { + auto M = pb->dims()[0]; + auto N = tb->dims()[0]; + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + Tensor tb_xy = F.Dot(*tb, m_aver); + Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 
0 : 1)); + + pb_xy.Resize({1, M, 2}); + pb_wh.Resize({1, M, 2}); + tb_xy.Resize({N, 1, 2}); + tb_wh.Resize({N, 1, 2}); + + auto shape_half = framework::make_ddim({N, M, 2}); + auto shape_full = framework::make_ddim({N, M, 4}); + + Tensor out_xy_0 = F.DivWithBroadCast( + F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); + Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); + Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); + + if (pbv) { + F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); + } else { + Tensor t_var; + std::vector vec_var(4); + for (auto i = 0; i < 4; i++) { + vec_var[i] = static_cast(variance[i]); + } + Vector2Tensor(ctx, vec_var, framework::make_ddim({1, 1, 4}), &t_var); + F.DivWithBroadCastVoid(out_0, t_var, shape_full, out); + } +} + +template +void BoxCoderDec(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, int axis, Tensor* out) { + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + auto pb_resize_shape = axis == 0 + ? framework::make_ddim({1, pb->dims()[0], 2}) + : framework::make_ddim({pb->dims()[0], 1, 2}); + pb_xy.Resize(pb_resize_shape); + pb_wh.Resize(pb_resize_shape); + + auto tbox_slice_shape = + framework::make_ddim({tb->dims()[0], tb->dims()[1], 2}); + std::vector tbox_slice_size = {static_cast(tb->dims()[0]), + static_cast(tb->dims()[1]), 2}; + Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); + Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); + + Tensor tb_xy; + Tensor tb_wh; + if (pbv) { + auto pbvt_slice_shape = framework::make_ddim({pbv->dims()[0], 2}); + auto pbvt_resize_shape = axis == 0 + ? 
framework::make_ddim({1, pbv->dims()[0], 2}) + : framework::make_ddim({pbv->dims()[0], 1, 2}); + std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; + Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); + Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); + pbv_t01.Resize(pbvt_resize_shape); + pbv_t23.Resize(pbvt_resize_shape); + + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } else if (variance.empty()) { + F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); + } else { + Tensor t_var01, t_var23; + auto t_var_shape = framework::make_ddim({1, 1, 2}); + std::vector vec_var01 = {static_cast(variance[0]), + static_cast(variance[1])}; + std::vector vec_var23 = {static_cast(variance[2]), + static_cast(variance[3])}; + Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01); + Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23); + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, + F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape), + tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } + Tensor obox01 = + F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); + Tensor obox23 = + F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), + (norm ? 0 : -1)); + F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); +} + +template +class BoxCoderNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* prior_box = ctx.Input("PriorBox"); + auto* prior_box_var = ctx.Input("PriorBoxVar"); + auto* target_box = ctx.Input("TargetBox"); + auto* output_box = ctx.Output("OutputBox"); + std::vector variance = ctx.Attr>("variance"); + const int axis = ctx.Attr("axis"); + + if (prior_box_var) { + PADDLE_ENFORCE_EQ(variance.empty(), true, + platform::errors::InvalidArgument( + "Input 'PriorBoxVar' and attribute 'variance'" + " of BoxCoder operator should not be used at the " + "same time.")); + } + if (!(variance.empty())) { + PADDLE_ENFORCE_EQ(static_cast(variance.size()), 4, + platform::errors::InvalidArgument( + "Size of attribute 'variance' in BoxCoder operator" + " should be 4. 
But received size is %d", + variance.size())); + } + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + platform::errors::InvalidArgument( + "Input 'TargetBox' of BoxCoder operator only" + " supports LoD with one level.")); + } + + auto code_type = GetBoxCodeType(ctx.Attr("code_type")); + bool normalized = ctx.Attr("box_normalized"); + + if (code_type == BoxCodeType::kEncodeCenterSize) { + BoxCoderEnc(ctx, target_box, prior_box, prior_box_var, normalized, + variance, output_box); + } else { + BoxCoderDec(ctx, target_box, prior_box, prior_box_var, normalized, + variance, axis, output_box); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(box_coder, ops::BoxCoderNPUKernel, + ops::BoxCoderNPUKernel); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 4261a5f2534c8..695d29b294a51 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -30,6 +30,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/gpu_launch_config.h" @@ -196,28 +197,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, config.thread_per_block.x * vec_size) + 1) * vec_size; - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if ((seed) && platform::is_gpu_place(seed->place())) { - framework::Tensor seed_cpu_tensor; - TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); - seed_data = static_cast(seed_cpu_tensor.data()[0]); - increment = offset; - } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { - auto seed_offset = gen_cuda->IncrementOffset(offset); - seed_data = seed_offset.first; - increment = seed_offset.second; - } else { - if (seed) { - seed_data = *(seed->data()); - } else { - std::random_device rnd; - seed_data = is_fix_seed ? seed_val : rnd(); - } - increment = offset; - } + + GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, + &seed_data, &increment); #ifdef __HIPCC__ if (vec_size == 4 && size % 4 == 0) { diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h new file mode 100644 index 0000000000000..a7188efe7139c --- /dev/null +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { + +inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* seed, + const bool is_fix_seed, const int seed_val, + const int offset, uint64_t* seed_data, + uint64_t* increment) { + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if ((seed) && platform::is_gpu_place(seed->place())) { + framework::Tensor seed_cpu_tensor; + TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); + *seed_data = static_cast(seed_cpu_tensor.data()[0]); + *increment = offset; + } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { + auto seed_offset = gen_cuda->IncrementOffset(offset); + *seed_data = seed_offset.first; + *increment = seed_offset.second; + } else { + if (seed) { + *seed_data = *(seed->data()); + } else { + std::random_device rnd; + *seed_data = is_fix_seed ? seed_val : rnd(); + } + *increment = offset; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 9700b9a2f7a1c..cbfb795d6a23e 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -42,6 +42,19 @@ class DropoutOp : public framework::OperatorWithKernel { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "Seed") { + VLOG(10) << "var_name:" << var_name + << " does not need to transform in dropout op"; + return expected_kernel_type; + } + + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } }; class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc new file mode 100644 index 0000000000000..c1aac4546e36e --- /dev/null +++ b/paddle/fluid/operators/eig_op.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
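+//
+// This file defines the Eig operator for the CPU: EigOp::InferShape requires
+// a batch of square matrices (rank >= 2, last two dims equal), keeps the
+// input shape for the Eigenvectors output and drops the trailing dimension
+// for the Eigenvalues output, while EigGradOp maps the Eigenvectors shape
+// back onto X@GRAD. A rough shape summary (illustrative only, batch dims
+// elided as "..."):
+//
+//   X            : [..., n, n]  float32 / float64 / complex64 / complex128
+//   Eigenvalues  : [..., n]     complex64 / complex128
+//   Eigenvectors : [..., n, n]  complex64 / complex128
+//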
+ +#include "paddle/fluid/operators/eig_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class EigOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eig"); + OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", + "Eig"); + OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", + "Eig"); + + auto x_dims = ctx->GetInputDim("X"); + int rank = x_dims.size(); + PADDLE_ENFORCE_GE(rank, 2, platform::errors::InvalidArgument( + "Expects input tensor x to be not less than " + "2 dimentions, but got dimention %d", + rank)); + PADDLE_ENFORCE_EQ(x_dims[rank - 2], x_dims[rank - 1], + platform::errors::InvalidArgument( + "The input matrix must be a square matrix, " + "but receive a matrix with %d rows and %d colums", + x_dims[rank - 2], x_dims[rank - 1])); + + std::vector batch_dims_vec{}; + for (int i = 0; i < rank - 1; ++i) { + batch_dims_vec.emplace_back(x_dims[i]); + } + + ctx->SetOutputDim("Eigenvectors", x_dims); + ctx->SetOutputDim("Eigenvalues", framework::make_ddim(batch_dims_vec)); + } + + protected: + // The output of eig is always complex-valued even for real-valued inputs + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + if (dtype != framework::proto::VarType::FP32 && + dtype != framework::proto::VarType::FP64 && + dtype != framework::proto::VarType::COMPLEX64 && + dtype != framework::proto::VarType::COMPLEX128) { + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported data type: %s!", dtype)); + } + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +class EigOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor), A complex-valued or real-valued tensor with shape (*, " + "n, n). The accepted datatype is one of float32, float64, complex64 " + "or complex128"); + AddOutput("Eigenvalues", + "(Tensor), The output eigenvalues tensor with shape (*, n). The " + "datatype is complex64 or complex128"); + AddOutput("Eigenvectors", + "(Tensor), The output eigenvectors tensor with shape (*, n, n). " + "The datatype is complex64 or complex128"); + + AddComment(R"DOC( + Eig Operator. + +This API processes eigen decomposition for general square matrices. 
+ +)DOC"); + } +}; + +class EigGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Eigenvalues"), "Input", "Eigenvalues", + "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput("Eigenvectors"), "Input", "Eigenvectors", + "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvalues")), + "Input", "Eigenvalues@GRAD", "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvectors")), + "Input", "Eigenvectors@GRAD", "EigGrad"); + + auto dims = ctx->GetInputDim("Eigenvectors"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Eigenvectors")), + ctx.device_context()); + } +}; + +template +class EigGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Eigenvalues", this->Output("Eigenvalues")); + op->SetInput("Eigenvectors", this->Output("Eigenvectors")); + op->SetInput(framework::GradVarName("Eigenvalues"), + this->OutputGrad("Eigenvalues")); + op->SetInput(framework::GradVarName("Eigenvectors"), + this->OutputGrad("Eigenvectors")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +using complex64 = paddle::platform::complex; +using complex128 = paddle::platform::complex; + +namespace ops = paddle::operators; +REGISTER_OPERATOR(eig, ops::EigOp, ops::EigOpMaker, + ops::EigGradOpMaker, + ops::EigGradOpMaker); + +REGISTER_OPERATOR(eig_grad, ops::EigGradOp); + +REGISTER_OP_CPU_KERNEL( + eig, ops::EigKernel, + ops::EigKernel, + ops::EigKernel, + ops::EigKernel); + +REGISTER_OP_CPU_KERNEL( + eig_grad, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h new file mode 100644 index 0000000000000..b9a3cb300b4c2 --- /dev/null +++ b/paddle/fluid/operators/eig_op.h @@ -0,0 +1,330 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matrix_solve.h" +#include "paddle/fluid/operators/svd_helper.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/for_range.h" +#define EPSILON 1e-6 + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +inline int BatchCount(const Tensor& matrix) { + int count = 1; + int num_dims = matrix.dims().size(); + for (int i = 0; i < num_dims - 2; ++i) { + count *= matrix.dims()[i]; + } + return count; +} + +inline int MatrixStride(const Tensor& matrix) { + framework::DDim dims_list = matrix.dims(); + int num_dims = dims_list.size(); + return dims_list[num_dims - 1] * dims_list[num_dims - 2]; +} + +// Transpose two axis of a Tensor +template +void TransposeTwoAxis(const Tensor& input, Tensor* transposed_input, + const int axis1, const int axis2, + const framework::ExecutionContext& context) { + std::vector permute(input.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis1] = axis2; + permute[axis2] = axis1; + + transposed_input->mutable_data(input.dims(), context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + TransCompute(input.dims().size(), dev_ctx, input, + transposed_input, permute); +} + +// Apply eig to a batch of matrices, values, vectors and (intermidiate +// tensor) info are overritten +template +void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, + const framework::ExecutionContext& context) { + char jobvl = 'N'; + char jobvr = 'V'; // only right eigenvectors are computed + int num_dims = input->dims().size(); + int order = input->dims()[num_dims - 1]; + + T* input_data = input->data(); + int lda = std::max(1, order); + T* values_data = values->mutable_data(context.GetPlace()); + T* lvector_data = nullptr; + int ldvl = 1; + T* rvector_data = vectors->mutable_data(context.GetPlace()); + int ldvr = lda; + int lwork = -1; + + int batch_count = BatchCount(*input); + int matrix_stride = MatrixStride(*input); + int values_stride = values->dims()[values->dims().size() - 1]; + + Tensor rwork; + math::Real* rwork_data = nullptr; + + rwork.Resize(framework::make_ddim({lda * 2})); + rwork_data = rwork.mutable_data>(context.GetPlace()); + + // call lapackEig once to compute the size of work; + T computed_work_size; + math::lapackEig>( + jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, + rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); + + lwork = std::max(1, static_cast(math::Real(computed_work_size))); + Tensor work; + work.Resize(framework::make_ddim({lwork})); + T* work_data = work.mutable_data(context.GetPlace()); + + for (auto i = 0; i < batch_count; ++i) { + T* current_matrix = &input_data[i * matrix_stride]; + T* current_values = &values_data[i * values_stride]; + T* current_rvectors = &rvector_data[i * matrix_stride]; + + math::lapackEig>( + jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, + ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); + PADDLE_ENFORCE_EQ( + info, 0, + platform::errors::PreconditionNotMet( + "current info is not 0, computation failed. " + "= 0: successful exit." + "< 0: if INFO = -i, the i-th argument had an illegal value." 
+ "> 0: if INFO = i, the QR algorithm failed to compute all the " + "eigenvalues, and no eigenvectors have been computed; " + "elements i+1:N of WR and WI contain eigenvalues which " + "have converged.")); + } +} + +template +void ApplyEigKernel(const Tensor& input, Tensor* values, Tensor* vectors, + const framework::ExecutionContext& context) { + Tensor input_column_major; + Tensor vectors_row_major; + int num_dims = input.dims().size(); + + // transfer to column-major memory layout i.e. make_ddim from tranposed_input: + // [batch,row,col]->[batch,col,row] + TransposeTwoAxis(input, &input_column_major, num_dims - 1, + num_dims - 2, context); + // make sure 'vectors_row_major' holds memory before passed to LapackEig() + vectors_row_major.Resize(input.dims()); + int info = 0; + LapackEig(&input_column_major, values, &vectors_row_major, info, context); + + // transfer column-major layout back + // vectors_row_major: column-major layout + // vector: original layout + TransposeTwoAxis(vectors_row_major, vectors, num_dims - 1, + num_dims - 2, context); +} + +template +void ConstructComplexVectors(Tensor* c_vectors, const Tensor& c_values, + const Tensor& r_vectors, + const framework::ExecutionContext& ctx, + int batch_count, int order) { + int matrix_stride = MatrixStride(r_vectors); + + auto* c_vectors_data = c_vectors->mutable_data(ctx.GetPlace()); + auto* c_values_data = c_values.data(); + auto* r_v_data = r_vectors.data(); + + for (int b = 0; b < batch_count; b++) { + auto* vecs = &r_v_data[b * matrix_stride]; + auto* res = &c_vectors_data[b * matrix_stride]; + auto* vals = &c_values_data[b * order]; + + for (int j = 0; j < order; j++) { + if (vals[j].imag < EPSILON) { + for (int i = 0; i < order; i++) { + res[j * order + i] = platform::complex(vecs[j * order + i], 0); + } + } else { + for (int i = 0; i < order; i++) { + res[j * order + i] = platform::complex(vecs[j * order + i], + vecs[(j + 1) * order + i]); + res[(j + 1) * order + i] = platform::complex( + vecs[j * order + i], -vecs[(j + 1) * order + i]); + } + j++; + } + } + } +} + +template +class EigKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_values = context.Output("Eigenvalues"); + auto* out_vectors = context.Output("Eigenvectors"); + + if (!framework::IsComplexType(x->type())) { + out_values->mutable_data(context.GetPlace()); + out_vectors->mutable_data(context.GetPlace()); + + int batch_count = BatchCount(*x); + int order = x->dims()[x->dims().size() - 1]; + + Tensor real_values; + Tensor real_vectors; + // double the size of real_values, the first half stores the real part, + // the next half stores the imag part + std::vector origin_dim = + framework::vectorize(out_values->dims()); + int last_item = origin_dim.back(); + origin_dim.pop_back(); + origin_dim.push_back(last_item * 2); + framework::DDim big_dim = framework::make_ddim(origin_dim); + + real_values.mutable_data>(big_dim, context.GetPlace()); + real_vectors.mutable_data>(x->dims(), context.GetPlace()); + + ApplyEigKernel>(*x, &real_values, + &real_vectors, context); + auto dito = + math::DeviceIndependenceTensorOperations, + Tout>(context); + + // 1. extract real part & imag part from real_values + Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); + Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + + // 2. 
construct complex values + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); + int out_values_numel = out_values->numel(); + platform::ForRange for_range( + context.template device_context(), out_values_numel); + math::RealImagToComplexFunctor functor( + real_part_data, imag_part_data, + out_values->mutable_data(context.GetPlace()), out_values_numel); + for_range(functor); + + // 3. construct complex vectors + Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor out_vectors_trans; + out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); + ConstructComplexVectors, Tout>( + &out_vectors_trans, *out_values, real_vector_trans, context, + batch_count, order); + TransposeTwoAxis(out_vectors_trans, out_vectors, + x->dims().size() - 1, + x->dims().size() - 2, context); + } else { + out_values->mutable_data(context.GetPlace()); + out_vectors->mutable_data(context.GetPlace()); + + ApplyEigKernel(*x, out_values, out_vectors, context); + } + } +}; + +template +void ComputeBackwardForComplexInput( + const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, + Tout* x_grad_data, int batch_count, int order, + const framework::ExecutionContext& context) { + auto dito = + math::DeviceIndependenceTensorOperations( + context); + + Tensor trans_v = dito.Transpose(V); + Tensor Vh = dito.Conj(trans_v); + Tensor Lconj = dito.Conj(L); + Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); + Tensor VhgV = dito.Matmul(Vh, gV); + Tensor diag_real = dito.Real(VhgV); + Tensor diag_res = dito.BatchDiag(diag_real, batch_count); + Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + + // turn diag_unsqueezed into complex + auto numel = diag_unsqueezed.numel(); + Tensor diag_unsqueezed_complex; + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + diag_unsqueezed.dims(), context.GetPlace(), + static_cast(numel * sizeof(Tout))); + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); + for_range(functor); + // real tensor multiply complex tensor in broadcast manner + Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); + Tensor res2 = dito.Matmul(Vh, res1); + Tensor result = dito.Sub(VhgV, res2); + + result.mutable_data(V.dims(), context.GetPlace()); + result = dito.Div(result, Econj); + result = dito.DiagFill(order, order, order, 0, gL, result); + Tensor rhs = dito.Matmul(result, Vh); + + // solve linear system + // solve(Vh, rhs, out, m, k) + // Vh: matrix with shape [m,m] + // rhs: rhs with shape [m,k] + // x_grad: out + int m = Vh.dims()[Vh.dims().size() - 1]; + int k = rhs.dims()[rhs.dims().size() - 1]; + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); +} + +template +class EigGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& L = *context.Input("Eigenvalues"); + auto& V = *context.Input("Eigenvectors"); + auto& gL = *context.Input(framework::GradVarName("Eigenvalues")); + auto& gV = *context.Input(framework::GradVarName("Eigenvectors")); + + auto& x_grad = *context.Output(framework::GradVarName("X")); + auto* x_grad_data = x_grad.mutable_data(context.GetPlace()); + + auto& dims = V.dims(); + framework::DDim dim_origin = dims; + int num_dims = 
dim_origin.size(); + int batch_count = BatchCount(V); + const int order = dim_origin[num_dims - 1]; + + ComputeBackwardForComplexInput( + V, L, gL, gV, x_grad_data, batch_count, order, context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 47aa7e2521f76..b2030ad21e8d1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -12,67 +12,127 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND_CL -#include -#include - #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; - -template +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const int axis, + const framework::DDim& ddims, + const framework::DDim& brd_ddims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t brd_size = brd_ddims.size(); + int64_t org_size = ddims.size(); + // int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < brd_size; ++i) { + if (i < axis || i >= org_size + axis) { + axes.push_back(i); + continue; + } + if (brd_ddims[i] > ddims[i - axis]) { + axes.push_back(i); + } + } + // LOG(INFO) << "axes = " << framework::make_ddim(axes).to_str(); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class ElementwiseMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + + bool direct_compute = false; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + if (x_dims.size() >= y_dims.size()) { + direct_compute = x_dims.size() == (y_dims.size() + axis); + } else { + direct_compute = y_dims.size() == (x_dims.size() + axis); + } - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); + auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); - runner.Run(stream); + if (direct_compute) { + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); + runner.Run(stream); + } else { + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); + const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); + runner.Run(stream); + } } }; -template +template class ElementwiseMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); - auto place = ctx.GetPlace(); + axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); + auto stream = ctx.template device_context().stream(); - auto stream = - ctx.template device_context() - .stream(); + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); if (dx) { - dx->mutable_data(place); - const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); - runner_dx.Run(stream); + if (dx->dims() == dout->dims()) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); + runner_dx.Run(stream); + } else { + Tensor dx_temp(x->type()); + dx_temp.Resize(trans_x.dims()); + dx_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {}); + runner_dx.Run(stream); + ReduceDims(ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, + dx); + } } - if (dy) { - dy->mutable_data(place); - const auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); - runner_dy.Run(stream); + if (dy->dims() == dout->dims()) { + dy->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); + runner_dy.Run(stream); + } else { + Tensor dy_temp(y->type()); + dy_temp.Resize(trans_y.dims()); + dy_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dy = + NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {}); + runner_dy.Run(stream); + ReduceDims(ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, + dy); + } } } }; @@ -82,15 +142,9 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - elementwise_mul, - ops::ElementwiseMulNPUKernel, - ops::ElementwiseMulNPUKernel); +REGISTER_OP_NPU_KERNEL(elementwise_mul, ops::ElementwiseMulNPUKernel, + ops::ElementwiseMulNPUKernel); REGISTER_OP_NPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradNPUKernel, - ops::ElementwiseMulGradNPUKernel); -#endif + elementwise_mul_grad, ops::ElementwiseMulGradNPUKernel, + ops::ElementwiseMulGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 94e78defbbee5..48b98dafc7bb5 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -166,9 +166,11 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, +REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, + ops::ElementwiseSubNPUKernel, ops::ElementwiseSubNPUKernel); REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index d5204f5cacfc6..566b265bfdba9 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -63,9 +63,12 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { .stream(); auto shape = out->dims(); - const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out}, - {{"dims", framework::vectorize(shape)}}); - runner.Run(stream); + NpuOpRunner runner; + runner.SetType("Fill") + .AddInput(framework::vectorize(shape)) + .AddInput(tensor_tmp) + .AddOutput(*out) + .Run(stream); } }; @@ -75,5 +78,8 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(fill_any_like, ops::FillAnyLikeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::FillAnyLikeNPUKernel, +#endif ops::FillAnyLikeNPUKernel, ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_diagonal_op.cc b/paddle/fluid/operators/fill_diagonal_op.cc index db55c3e99693a..be3239d504844 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cc +++ b/paddle/fluid/operators/fill_diagonal_op.cc @@ -108,8 +108,15 @@ class FillIDiagonalKernel : public framework::OpKernel { size = std::min(size, out_dims[1] * out_dims[1]); } - for (int64_t i = offset; i < size; i += strides) { - out_data[i] = temp_var; + for (int64_t i = 0; i < size; i += strides) { + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. 
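+      // For example, when out_dims[1] == 4 (4 columns, so strides == 5 for a
+      // diagonal walk) and offset == 2: at i == 10 the column is 10 % 4 == 2,
+      // and 2 + 2 == 4 already falls outside the row, so out_data[12]
+      // (row 3, column 0) is left untouched instead of the diagonal wrapping
+      // onto the next line.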
+ // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if (i % out_dims[1] + offset >= 0 && + i % out_dims[1] + offset < out_dims[1]) { + out_data[i + offset] = temp_var; + } } } }; @@ -176,8 +183,11 @@ class FillIDiagonalGradKernel : public framework::OpKernel { wrapsize = size; } - for (int64_t i = offset; i < wrapsize; i += strides) { - data[i] = T(0); + for (int64_t i = 0; i < wrapsize; i += strides) { + if (i % dx_dims[1] + offset >= 0 && + i % dx_dims[1] + offset < dx_dims[1]) { + data[i + offset] = T(0); + } } } } diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu index 5047059fb364d..15eabd4216d0b 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cu +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -22,11 +22,19 @@ using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_constant_kernel(const int64_t featuresize, T* in_data, - int64_t strides, int offset, T fillvar) { + int64_t strides, int offset, T fillvar, + int dims) { for (int64_t idx = blockIdx.x * featuresize + threadIdx.x; idx * strides + offset < (blockIdx.x + 1) * featuresize; idx += blockDim.x) { - in_data[idx * strides + offset] = fillvar; + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. + // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if ((idx * strides) % dims + offset < dims && + (idx * strides) % dims + offset >= 0) { + in_data[idx * strides + offset] = fillvar; + } } } @@ -62,7 +70,7 @@ class FillIDiagonalCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size / strides), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(size, out_data, strides, - offset, temp_var); + offset, temp_var, out_dims[1]); } }; @@ -96,7 +104,7 @@ class FillIDiagonalGradCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(wrapsize, in_data, strides, - offset, T(0)); + offset, T(0), out_dims[1]); } }; diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 0858a43838b96..14f2e9061b742 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -77,9 +77,17 @@ class FlattenOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -101,6 +109,14 @@ class FlattenOpMaker : public framework::OpProtoAndCheckerMaker { "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the" "input tensor is (d_0, d_1, ... d_n).") .SetDefault(1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). 
Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddComment(R"DOC( Flatten Operator @@ -139,9 +155,17 @@ class FlattenGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -198,6 +222,21 @@ class Flatten2Op : public framework::OperatorWithKernel { ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", "XShape"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class Flatten2OpMaker : public FlattenOpMaker { @@ -244,9 +283,17 @@ class Flatten2GradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 599be6912b760..2630c12db2fc9 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -80,5 +80,6 @@ if (WITH_GPU OR WITH_ROCM) endif() if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) + cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() endif() diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc new file mode 100644 index 0000000000000..837bca6c2cf4e --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -0,0 +1,814 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" +#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace op = paddle::operators; +using Tensor = paddle::framework::Tensor; + +USE_OP(batch_norm); +USE_CUDA_ONLY_OP(fused_bn_add_activation); +USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + std::default_random_engine random(0); + std::uniform_real_distribution dis(-1.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} + +template +void InitConstantTensor(const std::vector &dims, T value, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = value; + } +} + +template +void CheckOutput(std::string name, const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + if (cpu_res.dims().size() == cpu_base.dims().size()) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + } else { + EXPECT_EQ(cpu_res.numel(), cpu_base.numel()); + } + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + float max_diff = 0; + int index = 0; + for (int i = 0; i < cpu_res.numel(); ++i) { + float cur_diff; + if (is_relative_atol) { + cur_diff = static_cast( + std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + cur_diff = static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + if (cur_diff > max_diff) { + max_diff = cur_diff; + index = i; + } + } + std::string error_type = is_relative_atol ? "relative" : "absolute"; + LOG(INFO) << "[" << name << "] The dims is [" << cpu_res.dims() + << "], maximum " << error_type << " error is " << max_diff << ": " + << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index]; +} + +template +void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + // x is in NHWC format. 
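+  // For every channel c (the last NHWC dimension) the loop below folds the
+  // N * H * W values of that channel into per-channel sum and sum-of-squares
+  // tensors of shape [1, 1, 1, C]; the BN stats finalize step can then
+  // recover mean = sum / nhw and the biased variance
+  // sum_of_squares / nhw - mean * mean from them.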
+ auto dims = cpu_x.dims(); + int64_t c = dims[3]; + + const T *cpu_x_ptr = cpu_x.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_x.numel() / c; ++i) { + float tmp_x = static_cast(cpu_x_ptr[i * c + j]); + tmp_sum += tmp_x; + tmp_sum_of_squares += tmp_x * tmp_x; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; + } +} + +template +void ComputeInplaceAdd(const framework::Tensor &cpu_x, + framework::Tensor *cpu_y) { + EXPECT_EQ(cpu_x.dims(), cpu_y->dims()); + + const T *cpu_x_ptr = cpu_x.data(); + T *cpu_y_ptr = cpu_y->data(); + for (int64_t i = 0; i < cpu_x.numel(); ++i) { + cpu_y_ptr[i] += cpu_x_ptr[i]; + } +} + +template +void ComputeInplaceRelu(framework::Tensor *cpu_x) { + T *cpu_x_ptr = cpu_x->data(); + for (int64_t i = 0; i < cpu_x->numel(); ++i) { + cpu_x_ptr[i] = + cpu_x_ptr[i] > static_cast(0) ? cpu_x_ptr[i] : static_cast(0); + } +} + +void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + std::string data_layout = "NHWC"; + attrs.insert({"data_layout", data_layout}); + + auto op = framework::OpRegistry::CreateOp( + "batch_norm", {{"X", {"X"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"Mean", {"Mean"}}, + {"Variance", {"Variance"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + // reserved_space will stay on GPU and used in grad op. 
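+  // ShareDataWith only aliases the underlying GPU allocation (no copy), so
+  // the backward helper can later hand the very same reserve space to the
+  // grad op.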
+ saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_z, + const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *z = scope.Var("Z")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_z, place, z); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation", + {{"X", {"X"}}, {"Z", {"Z"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + // reserved_space will stay on GPU and used in grad op. 
+ saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluBackward( + const platform::CUDADeviceContext &ctx, const Tensor &cpu_dy, + const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, + const Tensor &cpu_saved_mean, const Tensor &cpu_saved_var, + const Tensor &cpu_y, const Tensor &saved_reserve_space, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *dy = scope.Var("Y@GRAD")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + auto *dx = scope.Var("X@GRAD")->GetMutable(); + auto *dz = scope.Var("Z@GRAD")->GetMutable(); + auto *dscale = scope.Var("Scale@GRAD")->GetMutable(); + auto *dbias = scope.Var("Bias@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_y, place, y); + TensorCopySync(cpu_dy, place, dy); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(cpu_saved_mean, place, saved_mean); + TensorCopySync(cpu_saved_var, place, saved_var); + reserve_space->ShareDataWith(saved_reserve_space); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + saved_mean->Resize({channels}); + saved_var->Resize({channels}); + + framework::AttributeMap attrs; + float momentum = 0.9; + float epsilon = 1e-5; + std::string act_type = "relu"; + attrs.insert({"momentum", momentum}); + attrs.insert({"epsilon", epsilon}); + attrs.insert({"act_type", act_type}); + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation_grad", {{"X", {"X"}}, + {"Y", {"Y"}}, + {"Y@GRAD", {"Y@GRAD"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + {{"X@GRAD", {"X@GRAD"}}, + {"Z@GRAD", {"Z@GRAD"}}, + {"Scale@GRAD", {"Scale@GRAD"}}, + {"Bias@GRAD", {"Bias@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); +} + +template +class CudnnBNAddReluTester { + public: + CudnnBNAddReluTester(int batch_size, int height, int width, int channels, + std::string act_type, bool fuse_add, bool has_shortcut) { + batch_size_ = batch_size; + height_ = height; + width_ = width; + channels_ = channels; + ele_count_ = batch_size_ * height_ * width_; + act_type_ = act_type; + fuse_add_ = fuse_add; + has_shortcut_ = has_shortcut; + SetUp(); + } + + ~CudnnBNAddReluTester() {} + + void CheckForward(float diff, bool is_relative_atol = false) { + LOG(INFO) << "[CheckForward, diff=" << diff + << ", is_relative_atol=" << is_relative_atol + << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ + << ", has_shortcut=" << has_shortcut_; + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + auto select = [&](Tensor *in) { return has_shortcut_ ? 
in : nullptr; }; + + framework::Tensor cpu_mean_base_x; + framework::Tensor cpu_var_base_x; + framework::Tensor cpu_mean_base_z; + framework::Tensor cpu_var_base_z; + if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) { + BaselineForwardFusedBNAddRelu( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_); + } else { + BaselineForward( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_, + select(&cpu_mean_base_z), select(&cpu_var_base_z), + select(&cpu_saved_mean_base_z_), select(&cpu_saved_var_base_z_), + select(&saved_reserve_space_z_)); + } + + framework::Tensor cpu_mean_x; + framework::Tensor cpu_var_x; + framework::Tensor cpu_y; + framework::Tensor cpu_mean_z; + framework::Tensor cpu_var_z; + FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, &cpu_saved_mean_x_, + &cpu_saved_var_x_, &cpu_y, &cpu_bitmask_, select(&cpu_mean_z), + select(&cpu_var_z), select(&cpu_saved_mean_z_), + select(&cpu_saved_var_z_)); + + CheckOutput("Mean", cpu_mean_x, cpu_mean_base_x, diff, + is_relative_atol); + CheckOutput("Variance", cpu_var_x, cpu_var_base_x, diff, + is_relative_atol); + CheckOutput("SavedMean", cpu_saved_mean_x_, cpu_saved_mean_base_x_, + diff, is_relative_atol); + CheckOutput("SavedVariance", cpu_saved_var_x_, cpu_saved_var_base_x_, + diff, is_relative_atol); + if (has_shortcut_) { + CheckOutput("MeanZ", cpu_mean_z, cpu_mean_base_z, diff, + is_relative_atol); + CheckOutput("VarianceZ", cpu_var_z, cpu_var_base_z, diff, + is_relative_atol); + CheckOutput("SavedMeanZ", cpu_saved_mean_z_, + cpu_saved_mean_base_z_, diff, is_relative_atol); + CheckOutput("SavedVarianceZ", cpu_saved_var_z_, + cpu_saved_var_base_z_, diff, is_relative_atol); + } + CheckOutput("Y", cpu_y, cpu_y_base_, diff, is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_dx_base; + framework::Tensor cpu_dz_base; + framework::Tensor cpu_dscale_base; + framework::Tensor cpu_dbias_base; + BaselineBackwardFusedBNAddRelu(*ctx, &cpu_dx_base, &cpu_dz_base, + &cpu_dscale_base, &cpu_dbias_base); + + framework::Tensor cpu_dx; + framework::Tensor cpu_dz; + framework::Tensor cpu_dscale; + framework::Tensor cpu_dbias; + FusedBackward(*ctx, &cpu_dx, &cpu_dz, &cpu_dscale, &cpu_dbias); + + CheckOutput("DX", cpu_dx, cpu_dx_base, diff, is_relative_atol); + CheckOutput("DZ", cpu_dz, cpu_dz_base, diff, is_relative_atol); + CheckOutput("DScale", cpu_dscale, cpu_dscale_base, diff, + is_relative_atol); + CheckOutput("DBias", cpu_dbias, cpu_dbias_base, diff, + is_relative_atol); + } + + private: + void SetUp() { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_x_); + InitRandomTensor({channels_}, &cpu_bn_scale_x_); + InitRandomTensor({channels_}, &cpu_bn_bias_x_); + + if (has_shortcut_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + InitRandomTensor({channels_}, &cpu_bn_scale_z_); + InitRandomTensor({channels_}, &cpu_bn_bias_z_); + } else { + if (fuse_add_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + } + } + + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_dy_); + } + + void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var) { + InitConstantTensor({channels_}, 
static_cast(0.0f), cpu_mean); + InitConstantTensor({channels_}, static_cast(1.0f), cpu_var); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_mean); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_var); + } + + void BaselineForward(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean_x, Tensor *cpu_var_x, + Tensor *cpu_saved_mean_x, Tensor *cpu_saved_var_x, + Tensor *cpu_y, Tensor *saved_reserve_space_x, + Tensor *cpu_mean_z = nullptr, + Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr, + Tensor *saved_reserve_space_z = nullptr) { + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_mean_x, cpu_var_x, cpu_saved_mean_x, + cpu_saved_var_x, cpu_y, saved_reserve_space_x); + if (has_shortcut_) { + framework::Tensor cpu_z_out; + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + ComputeBatchNormForward( + ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, cpu_mean_z, cpu_var_z, + cpu_saved_mean_z, cpu_saved_var_z, &cpu_z_out, saved_reserve_space_z); + ComputeInplaceAdd(cpu_z_out, cpu_y); + } else { + if (fuse_add_) { + ComputeInplaceAdd(cpu_z_, cpu_y); + } + } + if (act_type_ == "relu") { + ComputeInplaceRelu(cpu_y); + } + } + + void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean, Tensor *cpu_var, + Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); + ComputeFusedBNAddReluForward( + ctx, cpu_x_, cpu_z_, cpu_bn_scale_x_, cpu_bn_bias_x_, cpu_mean, cpu_var, + cpu_saved_mean, cpu_saved_var, cpu_y, saved_reserve_space); + } + + void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_dx, Tensor *cpu_dz, + Tensor *cpu_dscale, Tensor *cpu_dbias) { + ComputeFusedBNAddReluBackward( + ctx, cpu_dy_, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_saved_mean_base_x_, cpu_saved_var_base_x_, cpu_y_base_, + saved_reserve_space_x_, cpu_dx, cpu_dz, cpu_dscale, cpu_dbias); + } + + void ComputeFusedBNStatsFinalize(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, + const Tensor &cpu_bn_scale, + const Tensor &cpu_bn_bias, Tensor *sum, + Tensor *sum_of_square, Tensor *bn_scale, + Tensor *bn_bias, Tensor *mean, Tensor *var, + Tensor *saved_mean, Tensor *saved_var, + Tensor *equiv_scale, Tensor *equiv_bias) { + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_sum, place, sum); + TensorCopySync(cpu_sum_of_square, place, sum_of_square); + TensorCopySync(cpu_bn_scale, place, bn_scale); + TensorCopySync(cpu_bn_bias, place, bn_bias); + + bn_scale->Resize({1, 1, 1, channels_}); + bn_bias->Resize({1, 1, 1, channels_}); + + // input + float *sum_ptr = sum->data(); + float *sum_of_square_ptr = sum_of_square->data(); + float *bn_scale_ptr = bn_scale->data(); + float *bn_bias_ptr = bn_bias->data(); + + mean->Resize({1, 1, 1, channels_}); + var->Resize({1, 1, 1, channels_}); + + // output + float *mean_ptr = mean->data(); + float *var_ptr = var->data(); + float *saved_mean_ptr = + saved_mean->mutable_data({1, 1, 1, channels_}, place); + float *saved_var_ptr = + saved_var->mutable_data({1, 1, 1, channels_}, place); + T *equiv_scale_ptr = + equiv_scale->mutable_data({1, 1, 1, channels_}, place); + T 
*equiv_bias_ptr = + equiv_bias->mutable_data({1, 1, 1, channels_}, place); + + auto param_shape = framework::vectorize(bn_scale->dims()); + op::CudnnBNStatsFinalize bn_op(ctx, param_shape); + bn_op.Forward(ctx, sum_ptr, sum_of_square_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, mean_ptr, var_ptr, + equiv_scale_ptr, equiv_bias_ptr, eps_, momentum_, ele_count_, + true); + } + + // Get forward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean_x, + Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, + Tensor *cpu_saved_var_x, Tensor *cpu_y, Tensor *cpu_bitmask, + Tensor *cpu_mean_z = nullptr, Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr) { + framework::Tensor x; + framework::Tensor sum_x; + framework::Tensor sum_of_square_x; + framework::Tensor bn_scale_x; + framework::Tensor bn_bias_x; + + framework::Tensor z; + framework::Tensor sum_z; + framework::Tensor sum_of_square_z; + framework::Tensor bn_scale_z; + framework::Tensor bn_bias_z; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x_, place, &x); + if (fuse_add_ || has_shortcut_) { + TensorCopySync(cpu_z_, place, &z); + } + + framework::Tensor mean_x; + framework::Tensor var_x; + framework::Tensor saved_mean_x; + framework::Tensor saved_var_x; + framework::Tensor equiv_scale_x; + framework::Tensor equiv_bias_x; + + framework::Tensor mean_z; + framework::Tensor var_z; + framework::Tensor saved_mean_z; + framework::Tensor saved_var_z; + framework::Tensor equiv_scale_z; + framework::Tensor equiv_bias_z; + + framework::Tensor y; + framework::Tensor bitmask; + + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + TensorCopySync(*cpu_mean_x, place, &mean_x); + TensorCopySync(*cpu_var_x, place, &var_x); + if (has_shortcut_) { + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + TensorCopySync(*cpu_mean_z, place, &mean_z); + TensorCopySync(*cpu_var_z, place, &var_z); + } + + // 1. BN Stats Finalize + ComputeFusedBNStatsFinalize(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + &sum_x, &sum_of_square_x, &bn_scale_x, + &bn_bias_x, &mean_x, &var_x, &saved_mean_x, + &saved_var_x, &equiv_scale_x, &equiv_bias_x); + if (has_shortcut_) { + ComputeFusedBNStatsFinalize(ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, + &sum_z, &sum_of_square_z, &bn_scale_z, + &bn_bias_z, &mean_z, &var_z, &saved_mean_z, + &saved_var_z, &equiv_scale_z, &equiv_bias_z); + } + + T *x_ptr = x.data(); + T *z_ptr = (fuse_add_ || has_shortcut_) ? z.data() : nullptr; + T *equiv_scale_x_ptr = equiv_scale_x.data(); + T *equiv_bias_x_ptr = equiv_bias_x.data(); + T *equiv_scale_z_ptr = has_shortcut_ ? equiv_scale_z.data() : nullptr; + T *equiv_bias_z_ptr = has_shortcut_ ? equiv_bias_z.data() : nullptr; + T *y_ptr = + y.mutable_data({batch_size_, height_, width_, channels_}, place); + + int c = channels_; + int64_t nhw = ele_count_; + int32_t c_int32_elems = ((c + 63) & ~63) / 32; + int32_t nhw_int32_elems = (nhw + 31) & ~31; + int32_t *bitmask_ptr = bitmask.mutable_data( + {nhw_int32_elems, c_int32_elems, 1}, place); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale_x.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + // 2. 
Scale Bias + Relu + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, + has_shortcut_, data_shape, param_shape, + bitmask_shape); + sbar_op.Forward(ctx, x_ptr, equiv_scale_x_ptr, equiv_bias_x_ptr, y_ptr, + bitmask_ptr, z_ptr, equiv_scale_z_ptr, equiv_bias_z_ptr); + + TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); + TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); + TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x); + TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x); + if (has_shortcut_) { + TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z); + TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); + TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z); + TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z); + } + TensorCopySync(y, platform::CPUPlace(), cpu_y); + TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); + } + + // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedBackward(const platform::CUDADeviceContext &ctx, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Tensor dy; + framework::Tensor x; + framework::Tensor bn_scale; + framework::Tensor bn_bias; + framework::Tensor saved_mean; + framework::Tensor saved_var; + framework::Tensor bitmask; + framework::Tensor dx; + framework::Tensor dz; + framework::Tensor dscale; + framework::Tensor dbias; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_dy_, place, &dy); + TensorCopySync(cpu_x_, place, &x); + TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); + TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); + TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); + TensorCopySync(cpu_saved_var_x_, place, &saved_var); + TensorCopySync(cpu_bitmask_, place, &bitmask); + + bn_scale.Resize({1, 1, 1, channels_}); + bn_bias.Resize({1, 1, 1, channels_}); + saved_mean.Resize({1, 1, 1, channels_}); + saved_var.Resize({1, 1, 1, channels_}); + + T *dy_ptr = dy.data(); + T *x_ptr = x.data(); + float *bn_scale_ptr = bn_scale.data(); + float *bn_bias_ptr = bn_bias.data(); + float *saved_mean_ptr = saved_mean.data(); + float *saved_var_ptr = saved_var.data(); + int32_t *bitmask_ptr = bitmask.data(); + T *dx_ptr = + dx.mutable_data({batch_size_, height_, width_, channels_}, place); + T *dz_ptr = + dz.mutable_data({batch_size_, height_, width_, channels_}, place); + float *dscale_ptr = dscale.mutable_data({1, 1, 1, channels_}, place); + float *dbias_ptr = dbias.mutable_data({1, 1, 1, channels_}, place); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + std::string act_type = "relu"; + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, + param_shape, bitmask_shape); + sbar_op.Backward(ctx, dy_ptr, x_ptr, bn_scale_ptr, bn_bias_ptr, + saved_mean_ptr, saved_var_ptr, bitmask_ptr, dx_ptr, dz_ptr, + dscale_ptr, dbias_ptr, eps_); + + TensorCopySync(dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); + } + + private: + int batch_size_; + int height_; + int width_; + int channels_; + int ele_count_; + + std::string act_type_; + bool fuse_add_; + bool has_shortcut_; + + // Forward input + framework::Tensor cpu_x_; + framework::Tensor cpu_bn_scale_x_; + framework::Tensor cpu_bn_bias_x_; + 
framework::Tensor cpu_z_; + framework::Tensor cpu_bn_scale_z_; + framework::Tensor cpu_bn_bias_z_; + + // Backward input + framework::Tensor cpu_dy_; + framework::Tensor cpu_bitmask_; + framework::Tensor cpu_saved_mean_x_; + framework::Tensor cpu_saved_var_x_; + framework::Tensor cpu_saved_mean_z_; + framework::Tensor cpu_saved_var_z_; + framework::Tensor cpu_saved_mean_base_x_; + framework::Tensor cpu_saved_var_base_x_; + framework::Tensor saved_reserve_space_x_; + framework::Tensor cpu_saved_mean_base_z_; + framework::Tensor cpu_saved_var_base_z_; + framework::Tensor saved_reserve_space_z_; + framework::Tensor cpu_y_base_; + + double eps_ = 1e-5; + float momentum_ = 0.9; +}; + +TEST(CudnnBNAddReluFp16, BNAdd) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = ""; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + } +} + +TEST(CudnnBNAddReluFp16, BNAddRelu) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = "relu"; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + if (fuse_add) { + test.CheckBackward(2e-4); + } + } +} + +TEST(CudnnBNAddReluFp16, HasShortcut) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = ""; + bool fuse_add = false; + bool has_shortcut = true; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(5e-3); +} diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h new file mode 100644 index 0000000000000..7d4b24cd4fc3d --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct BNStatsFinalizeArgs { + BNStatsFinalizeArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::vector ¶m_shape) { + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to 4. But recieved " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + + in_desc.set(param_shape, format, param_dtype); + out_desc.set(param_shape, format, dtype); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; +}; + +template +class CudnnBNStatsFinalize { + public: + CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx, + const std::vector ¶m_shape) + : train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING), + inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) { + args_.Set(param_shape); + } + ~CudnnBNStatsFinalize() {} + + void Forward(const platform::CUDADeviceContext &ctx, float *sum_ptr, + float *sum_of_squares_ptr, float *scale_ptr, float *bias_ptr, + float *saved_mean_ptr, float *saved_invstd_ptr, + float *running_mean_ptr, float *running_var_ptr, + T *equiv_scale_ptr, T *equiv_bias_ptr, double eps, + float momentum, int64_t ele_count, bool is_train) { + if (is_train) { + TrainInit(ctx); + } else { + InferenceInit(ctx); + } + auto &op = is_train ? 
train_op_ : inference_op_; + + // Set variant_param for both inference_op_ and train_op_ + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_VAR, running_var_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, equiv_scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, equiv_bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, &eps); + + // Set extra variant_param only for train_op_: + if (is_train) { + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, saved_invstd_ptr); + double avg_factor = 1.0 - momentum; + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT, + &ele_count); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR, + &avg_factor); + } + // fused op execute + auto handle = ctx.cudnn_handle(); + op.Execute(handle); + } + + private: + void TrainInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for train op + train_op_.SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER, + CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for train op + train_op_.SetOpConstParamDesc( + {CUDNN_PARAM_YSTATS_DESC, CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC}, + args_.in_desc.desc()); + train_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + train_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. + size_t workspace_size_bytes = train_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + void InferenceInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for inference op + inference_op_.SetOpConstParamAttr( + {CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for inference op + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.in_desc.desc()); + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + inference_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. 
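      // Note: GetWorkspaceSizeInBytes() is also the call that builds the cudnn
      // fused-ops plan (it wraps cudnnMakeFusedOpsPlan), so it must run before
      // Execute(). The BN-stats-finalize ops are expected to need no scratch
      // space, hence the zero-size check below and the null workspace pointer
      // that follows it.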
+ size_t workspace_size_bytes = inference_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + BNStatsFinalizeArgs args_; + CudnnFusionOp train_op_; + CudnnFusionOp inference_op_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 4434681e60b3b..fcd354df938ac 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -14,10 +14,8 @@ limitations under the License. */ #pragma once -#include #include -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" @@ -41,12 +39,9 @@ class CudnnFusionOp { } ~CudnnFusionOp() { - // New 'fused op' descriptor destruction - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_)); - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnDestroyFusedOpsPlan(op_)); + dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_); + dynload::cudnnDestroyFusedOpsConstParamPack(op_const_params_); + dynload::cudnnDestroyFusedOpsPlan(op_); } // Execute fused op @@ -121,41 +116,49 @@ class CudnnFusionOp { // Get the workspace, which is required before Execute(). size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { - size_t workspace_bytes = 0U; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( - cudnn_handle, op_, op_const_params_, &workspace_bytes)); - plan_created_ = true; - return workspace_bytes; + if (!plan_created_) { + workspace_bytes_ = 0U; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + cudnn_handle, op_, op_const_params_, &workspace_bytes_)); + plan_created_ = true; + } + return workspace_bytes_; } private: bool plan_created_; + size_t workspace_bytes_; cudnnFusedOpsPlan_t op_; cudnnFusedOpsConstParamPack_t op_const_params_; cudnnFusedOpsVariantParamPack_t op_variant_params_; }; -static inline std::vector GetStrides(const std::vector &shape) { - if (shape.size() < 1) { - return {}; +class CudnnFusionOpCache { + public: + static CudnnFusionOpCache &Instance() { + static CudnnFusionOpCache instance; + return instance; + } + + framework::AlgorithmsCache *GetForward() { + return &forward_cache_; } - int dim = static_cast(shape.size()); - std::vector pro_shape(shape); - std::vector strides(dim); - int temp = pro_shape[1]; - pro_shape.erase(pro_shape.begin() + 1); - pro_shape.push_back(temp); - strides.back() = 1; - for (int i = dim - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * pro_shape[i + 1]; + framework::AlgorithmsCache *GetBackward() { + return &backward_cache_; } - strides.pop_back(); - strides.insert(strides.begin() + 1, 1); - return strides; -} -static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + private: + CudnnFusionOpCache() {} + ~CudnnFusionOpCache() { + // Need to delete the memory of cache. 
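      // Note: the caches hold raw CudnnFusionOp* values that are new-ed inside
      // the GetAlgorithm() callbacks (see cudnn_norm_conv.cu.h below), so a
      // complete teardown would presumably delete those pointers here. As this
      // is a process-lifetime singleton, leaking them at exit is mostly
      // benign, which is likely why the cleanup is left as a TODO.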
+ } + CudnnFusionOpCache(const CudnnFusionOpCache &) {} + + private: + framework::AlgorithmsCache forward_cache_; + framework::AlgorithmsCache backward_cache_; +}; #endif // CUDNN_VERSION >= 8000 } // namespace operators diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 1ead78b8b64e1..1a73281cb8dc6 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -15,125 +15,320 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; namespace dynload = platform::dynload; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + #if CUDNN_VERSION >= 8000 + +static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; } + template -class CudnnNormConvolutionOp { +struct NormConvolutionArgs { + NormConvolutionArgs() { + dtype = platform::CudnnDataType::type; + format = CUDNN_TENSOR_NHWC; + compute_type = platform::CudnnDataType::type; + } + + void Set(const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, int padding, int stride, + int dilation, int group) { + PADDLE_ENFORCE_EQ( + input_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of input_shape is expected to 4. But recieved " + "input_shape's size is %d, input_shape is [%s].", + input_shape.size(), framework::make_ddim(input_shape))); + PADDLE_ENFORCE_EQ( + filter_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of filter_shape is expected to 4. But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + filter_shape.size(), framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && + (filter_shape[1] == 1 || filter_shape[1] == 3), + true, + platform::errors::InvalidArgument( + "The filter_shape is expected to store as nhwc, and " + "h = w = 1 or 3. But recieved filter_shape is [%s].", + framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ( + output_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of output_shape is expected to 4. 
But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + output_shape.size(), framework::make_ddim(output_shape))); + + for (size_t i = 0; i < input_shape.size(); ++i) { + in_dims.push_back(input_shape[i]); + } + for (size_t i = 0; i < filter_shape.size(); ++i) { + filter_dims.push_back(filter_shape[i]); + } + paddings = {padding, padding}; + strides = {stride, stride}; + dilations = {dilation, dilation}; + + in_desc.set(input_shape, format, dtype); + filter_desc.set(filter_shape, format, dtype, group); + out_desc.set(output_shape, format, dtype); + + int output_channel = filter_shape[0]; + std::vector stats_shape = {1, 1, 1, output_channel}; + out_stats_desc.set(stats_shape, format, compute_type); + + conv_desc.set(dtype, paddings, strides, dilations, false, group); + } + + cudnnDataType_t dtype; + cudnnTensorFormat_t format; + cudnnDataType_t compute_type; + + std::vector in_dims; + std::vector filter_dims; + std::vector strides; + std::vector paddings; + std::vector dilations; + + platform::TensorDescriptor in_desc; + platform::FilterDescriptor filter_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor out_stats_desc; + platform::ConvolutionDescriptor conv_desc; +}; + +template +class CudnnNormConvolution { public: - CudnnNormConvolutionOp() - : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {} - ~CudnnNormConvolutionOp() {} - - void Init(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - cudnn_fwd_compute_type_ = platform::CudnnDataType::type; - dtype_ = platform::CudnnDataType::type; - format_ = CUDNN_TENSOR_NHWC; - - InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride, - dilate, group); - GetWorkspaceSize(ctx); + CudnnNormConvolution(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, const int &padding, + const int &stride, const int &dilation, + const int &group) { + args_.Set(input_shape, filter_shape, output_shape, padding, stride, + dilation, group); } + ~CudnnNormConvolution() {} void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr, T *filter_ptr, T *output_ptr, float *sum_ptr, float *sum_of_squares_ptr) { - auto handle = ctx.cudnn_handle(); - auto workspace_handle = ctx.cudnn_workspace_handle(); + auto cudnn_handle = ctx.cudnn_handle(); + + CudnnFusionOp *fwd_op = GetForwardOp(ctx); + size_t workspace_size = RoundUp( + static_cast(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); + // Set variant_param // input ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); - fwd_op_.SetOpVariantParamAttrPtr( - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); + fwd_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); + // output ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); - workspace_handle.RunFunc( + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); 
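      // Note: sum_ptr and sum_of_squares_ptr receive the per-channel
      // statistics (sum(y) and sum(y*y)) produced by the BNSTATS half of the
      // fused kernel; CudnnBNStatsFinalize later turns them into mean/variance
      // and the equivalent scale/bias. The workspace size set above is padded
      // to a 512-byte multiple, e.g. RoundUp(1000, 512) == 1024.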
+ fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + + ctx.cudnn_workspace_handle().RunFunc( [&](void *workspace_ptr) { // workspace ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); // fused op execute - fwd_op_.Execute(handle); + fwd_op->Execute(cudnn_handle); }, - fwd_workspace_byte_); + workspace_size); } - // TBD - void Backward(const platform::CUDADeviceContext &ctx) {} + private: + CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetForward()); + + CudnnFusionOp *fwd_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *fwd_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS); + + // Set constant_param + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, + CUDNN_PARAM_YDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + + // conv desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + // filter desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC, + args_.filter_desc.desc()); + // output desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + // output_stats desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, + args_.out_stats_desc.desc()); + // batch_norm mode + fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + + // Make cudnn fused ops plan + fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return fwd_op; + }); + return fwd_op; + } private: - void InitDescriptors(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - // Set constant_param - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, - CUDNN_PARAM_YDATA_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - - std::vector pad_vec = {pad, pad}; - std::vector stride_vec = {stride, stride}; - std::vector dilate_vec = {dilate, dilate}; - int output_channel = filter_shape[0]; - std::vector stats_shape = {1, 1, 1, output_channel}; + NormConvolutionArgs args_; +}; - // set conv desc - conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc()); +template +class CudnnNormConvolutionGrad { + public: + CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, + const int &padding, const int &stride, + const int &dilation, const int &group) { + args_.Set(input_shape, filter_shape, output_shape, padding, stride, + dilation, group); + dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } + ~CudnnNormConvolutionGrad() {} - // set input desc - in_desc_.set(input_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, 
in_desc_.desc()); + void Backward(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, + T *filter_grad_ptr, bool use_addto = false) { + if (filter_grad_ptr) { + BackwardFilter(ctx, input_ptr, output_grad_ptr, filter_ptr, + filter_grad_ptr); + } + if (input_grad_ptr) { + BackwardData(ctx, input_ptr, output_grad_ptr, filter_ptr, input_grad_ptr, + use_addto); + } + } - // set filter desc - filter_desc_.set(filter_shape, format_, dtype_, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc()); + private: + void BackwardFilter(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *filter_grad_ptr) { + auto cudnn_handle = ctx.cudnn_handle(); - // set output desc - out_desc_.set(output_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc()); + CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx); + size_t workspace_size = RoundUp( + static_cast(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); - // set output_stats desc - out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, - out_stats_desc_.desc()); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); - fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL); + ctx.cudnn_workspace_handle().RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + workspace_ptr); + // fused op execute + wgrad_op->Execute(cudnn_handle); + }, + workspace_size); } - void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) { - auto handle = ctx.cudnn_handle(); - fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + void BackwardData(const platform::CUDADeviceContext &ctx, T *input_ptr, + T *output_grad_ptr, T *filter_ptr, T *input_grad_ptr, + bool use_addto = false) { + auto cudnn_handle = ctx.cudnn_handle(); + size_t workspace_size = GetWorkspaceSizeBwdData(ctx); + + // Convolution dgrad followed optionally by batchnorm dgrad + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 
1.0f : 0.0f; + ctx.cudnn_workspace_handle().RunFunc( + [&](void *cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnConvolutionBackwardData( + cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr, + args_.out_desc.desc(), output_grad_ptr, + args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr, + workspace_size, &beta, args_.in_desc.desc(), input_grad_ptr)); + }, + workspace_size); } - size_t fwd_workspace_byte_ = 0; + CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetBackward()); + + CudnnFusionOp *wgrad_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *wgrad_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD); + + wgrad_op->SetOpConstParamAttr( + {CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER, + CUDNN_PARAM_DWDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); - cudnnDataType_t dtype_; - cudnnDataType_t cudnn_fwd_compute_type_; - platform::TensorDescriptor in_desc_; - platform::FilterDescriptor filter_desc_; - platform::TensorDescriptor out_desc_; - platform::TensorDescriptor out_stats_desc_; - platform::ConvolutionDescriptor conv_desc_; - cudnnTensorFormat_t format_; + // conv desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, + args_.in_desc.desc()); + // filter desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC, + args_.filter_desc.desc()); + // output desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC, + args_.out_desc.desc()); + wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); - CudnnFusionOp fwd_op_; + // Make cudnn fused ops plan + wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return wgrad_op; + }); + return wgrad_op; + } + + size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { + size_t workspace_size = 0U; + auto handle = ctx.cudnn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, args_.filter_desc.desc(), args_.out_desc.desc(), + args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_, + &workspace_size)); + return RoundUp(workspace_size, 512); + } + + private: + NormConvolutionArgs args_; + cudnnConvolutionBwdDataAlgo_t dgrad_algo_; }; + #endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 125ed85642292..4c14029b99c69 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ + #include #include @@ -29,56 +30,182 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP(conv2d); +USE_OP(conv2d_grad); USE_OP_DEVICE_KERNEL(conv2d, CUDNN); +USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + + std::default_random_engine random(0); + std::uniform_real_distribution dis(0.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} + +template +void TransposeNchwToNhwc(const framework::Tensor &cpu_in, + framework::Tensor *cpu_out) { + auto in_dims = cpu_in.dims(); + EXPECT_EQ(cpu_in.dims().size(), 4); + + const T *cpu_in_ptr = cpu_in.data(); + T *cpu_out_ptr = cpu_out->mutable_data( + {in_dims[0], in_dims[2], in_dims[3], in_dims[1]}, platform::CPUPlace()); + + int64_t n = in_dims[0]; + int64_t c = in_dims[1]; + int64_t hw = in_dims[2] * in_dims[3]; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < hw; ++j) { + for (int k = 0; k < c; ++k) { + int dst_idx = i * hw * c + j * c + k; + int src_idx = i * c * hw + k * hw + j; + cpu_out_ptr[dst_idx] = cpu_in_ptr[src_idx]; + } + } + } +} -// get paddle conv2d op results as baseline template -void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y, - const platform::CUDADeviceContext &ctx) { +void CheckOutput(const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + for (int i = 0; i < cpu_res.numel(); ++i) { + if (is_relative_atol) { + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + } +} + +// Use Paddle conv2d op results as baseline +void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + Tensor *cpu_output, int stride, int padding) { framework::Scope scope; - auto var_x = scope.Var("Input"); - auto tensor_x = var_x->GetMutable(); - auto var_w = scope.Var("Filter"); - auto tensor_w = var_w->GetMutable(); - auto var_y = scope.Var("Output"); - auto tensor_y = var_y->GetMutable(); + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output = scope.Var("Output")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(x, place, tensor_x); - TensorCopySync(w, place, tensor_w); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); attrs.insert({"use_cudnn", use_cudnn}); attrs.insert({"data_format", data_format}); - attrs.insert({"padding_algorithm", padding_algorithm}); auto op = framework::OpRegistry::CreateOp( "conv2d", {{"Input", {"Input"}}, {"Filter", {"Filter"}}}, {{"Output", {"Output"}}}, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*tensor_y, place, y); - ctx.Wait(); + TensorCopySync(*output, platform::CPUPlace(), cpu_output); +} + +// Use Paddle conv2d_grad op 
results as baseline +void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + const Tensor &cpu_output_grad, + framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad, int stride, + int padding, int dilation) { + framework::Scope scope; + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output_grad = + scope.Var("Output@GRAD")->GetMutable(); + auto *input_grad = + scope.Var("Input@GRAD")->GetMutable(); + auto *filter_grad = + scope.Var("Filter@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); + TensorCopySync(cpu_output_grad, place, output_grad); + + framework::AttributeMap attrs; + bool use_cudnn = true; + std::string data_format = "NHWC"; + std::string padding_algorithm = "EXPLICIT"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector dilations = {dilation, dilation}; + int groups = 1; + bool exhaustive_search = false; + bool use_addto = false; + attrs.insert({"use_cudnn", use_cudnn}); + attrs.insert({"data_format", data_format}); + attrs.insert({"padding_algorithm", padding_algorithm}); + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); + attrs.insert({"dilations", dilations}); + attrs.insert({"groups", groups}); + attrs.insert({"exhaustive_search", exhaustive_search}); + attrs.insert({"use_addto", use_addto}); + + auto op = framework::OpRegistry::CreateOp( + "conv2d_grad", {{"Input", {"Input"}}, + {"Filter", {"Filter"}}, + {"Output@GRAD", {"Output@GRAD"}}}, + {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad); } template -class TestCudnnNormConvOpForward { - public: - TestCudnnNormConvOpForward() { - batch_size_ = 2; - height_ = 8; - width_ = 8; - input_channels_ = 8; - output_channels_ = 32; - kernel_size_ = 1; - stride_ = 1; - pad_ = 0; +void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + auto dims = cpu_out.dims(); + int64_t c = dims[3]; + + const T *cpu_out_ptr = cpu_out.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_out.numel() / c; ++i) { + float tmp_out = static_cast(cpu_out_ptr[i * c + j]); + tmp_sum += tmp_out; + tmp_sum_of_squares += tmp_out * tmp_out; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; } +} - TestCudnnNormConvOpForward(int batch_size, int height, int width, +template +class CudnnNormConvolutionTester { + public: + CudnnNormConvolutionTester(int batch_size, int height, int width, int input_channels, int output_channels, int kernel_size, int stride) { batch_size_ = batch_size; @@ -88,133 +215,210 @@ class TestCudnnNormConvOpForward { output_channels_ = output_channels; kernel_size_ = kernel_size; stride_ = stride; - pad_ = (kernel_size_ - 1) / 2; + padding_ = (kernel_size_ - 1) / 2; + out_height_ = (height_ + 2 * padding_ - kernel_size_) / stride_ + 1; + out_width_ = (width_ + 2 * padding_ 
- kernel_size_) / stride_ + 1; + SetUp(); } - ~TestCudnnNormConvOpForward() {} + ~CudnnNormConvolutionTester() {} - void SetUp() { - input_size_ = batch_size_ * height_ * width_ * input_channels_; - filter_size_ = - output_channels_ * input_channels_ * kernel_size_ * kernel_size_; - output_size_ = batch_size_ * height_ * width_ * output_channels_; - param_size_ = output_channels_; - - input_vec_.resize(input_size_); - filter_raw_vec_.resize(filter_size_); - filter_pro_vec_.resize(filter_size_); - - std::default_random_engine random(0); - std::uniform_real_distribution dis(0.0, 1.0); - for (int i = 0; i < input_size_; ++i) { - input_vec_[i] = static_cast(dis(random)); - } - for (int i = 0; i < filter_size_; ++i) { - filter_raw_vec_[i] = static_cast(dis(random)); - } - // transpoes for filter - // NCHW->NHWC - for (int oc = 0; oc < output_channels_; ++oc) { - for (int kh = 0; kh < kernel_size_; ++kh) { - for (int kw = 0; kw < kernel_size_; ++kw) { - for (int ic = 0; ic < input_channels_; ++ic) { - int dst_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - kh * kernel_size_ * input_channels_ + - kw * input_channels_ + ic; - int src_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - ic * kernel_size_ * kernel_size_ + kh * kernel_size_ + - kw; - filter_pro_vec_[dst_idx] = filter_raw_vec_[src_idx]; - } - } - } + void CheckForward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + if (!Support(*ctx)) { + LOG(INFO) + << "Current test is only supported in the platforms with " + << "compatiblity greater than or equal to 70 and the kernel size " + << "must be equal to 1 or 3. Besides, when the kernel size is 1, " + << "the stride must be 1 if the compatiblity is equal to 70."; + return; } - framework::TensorFromVector(input_vec_, *ctx_, &input_); - input_.Resize({batch_size_, height_, width_, input_channels_}); - framework::TensorFromVector(filter_raw_vec_, *ctx_, &filter_raw_); - filter_raw_.Resize( - {output_channels_, input_channels_, kernel_size_, kernel_size_}); - framework::TensorFromVector(filter_pro_vec_, *ctx_, &filter_pro_); - filter_pro_.Resize( - {output_channels_, kernel_size_, kernel_size_, input_channels_}); - output_.Resize({batch_size_, height_, width_, output_channels_}); - base_output_.Resize({batch_size_, height_, width_, output_channels_}); - sum_.Resize({1, 1, 1, output_channels_}); - sum_of_squares_.Resize({1, 1, 1, output_channels_}); - ctx_->Wait(); + framework::Tensor cpu_output_base; + framework::Tensor cpu_sum_base; + framework::Tensor cpu_sum_of_square_base; + BaselineForward(*ctx, &cpu_output_base, &cpu_sum_base, + &cpu_sum_of_square_base); + + framework::Tensor cpu_output; + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + FusedForward(*ctx, &cpu_output, &cpu_sum, &cpu_sum_of_square); + + // Check forward correctness between baseline and results of normconv. 
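    // Note: with is_relative_atol == true, CheckOutput bounds the element-wise
    // relative error |res - base| / |base| by diff; otherwise it bounds the
    // absolute error |res - base|. The FP16 tests below use
    // CheckForward(1e-3, true), i.e. a 0.1% relative tolerance.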
+ CheckOutput(cpu_output, cpu_output_base, diff, is_relative_atol); + CheckOutput(cpu_sum, cpu_sum_base, diff, is_relative_atol); + CheckOutput(cpu_sum_of_square, cpu_sum_of_square_base, diff, + is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_input_grad_base; + framework::Tensor cpu_filter_nchw_grad_base; + framework::Tensor cpu_filter_nhwc_grad_base; + BaselineBackward(*ctx, &cpu_input_grad_base, &cpu_filter_nchw_grad_base); + TransposeNchwToNhwc(cpu_filter_nchw_grad_base, + &cpu_filter_nhwc_grad_base); + + framework::Tensor cpu_input_grad; + framework::Tensor cpu_filter_nhwc_grad; + FusedBackward(*ctx, &cpu_input_grad, &cpu_filter_nhwc_grad); + + // Check backward correctness between baseline and results of normconv. + CheckOutput(cpu_input_grad, cpu_input_grad_base, diff, is_relative_atol); + CheckOutput(cpu_filter_nhwc_grad, cpu_filter_nhwc_grad_base, diff, + is_relative_atol); + } + + private: + void SetUp() { + InitRandomTensor({batch_size_, height_, width_, input_channels_}, + &cpu_input_); + InitRandomTensor( + {output_channels_, input_channels_, kernel_size_, kernel_size_}, + &cpu_filter_nchw_); + // transpoes for filter, NCHW -> NHWC + TransposeNchwToNhwc(cpu_filter_nchw_, &cpu_filter_nhwc_); + InitRandomTensor( + {batch_size_, out_height_, out_width_, output_channels_}, + &cpu_output_grad_); + } + + void BaselineForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output_base, + framework::Tensor *cpu_sum_base, + framework::Tensor *cpu_sum_of_square_base) { + ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base, + stride_, padding_); + ComputeSumAndSquareSum(*cpu_output_base, cpu_sum_base, + cpu_sum_of_square_base); } - void BaselineForward() { - Conv2DForwardCompute(input_, filter_raw_, &base_output_, *ctx_); - ctx_->Wait(); + void BaselineBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad_base, + framework::Tensor *cpu_filter_grad_base) { + ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_grad_, + cpu_input_grad_base, cpu_filter_grad_base, stride_, + padding_, dilation_); } // get forward results of cudnn_norm_conv - void FusedForward() { - auto input_shape = framework::vectorize(input_.dims()); - auto filter_shape = framework::vectorize(filter_pro_.dims()); - auto output_shape = framework::vectorize(output_.dims()); - T *input_ptr = input_.data(); - T *filter_ptr = filter_pro_.data(); - T *output_ptr = output_.mutable_data(place_); - float *sum_ptr = sum_.mutable_data(place_); - float *sum_of_squares_ptr = sum_of_squares_.mutable_data(place_); - - std::shared_ptr> conv_op( - new op::CudnnNormConvolutionOp()); - conv_op->Init(*ctx_, input_shape, filter_shape, output_shape, pad_, stride_, - dilate_, group_); - conv_op->Forward(*ctx_, input_ptr, filter_ptr, output_ptr, sum_ptr, - sum_of_squares_ptr); - ctx_->Wait(); + void FusedForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output, framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output; + framework::Tensor sum; + framework::Tensor sum_of_square; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + + T *input_ptr = input.data(); + T 
*filter_ptr = filter_nhwc.data(); + T *output_ptr = output.mutable_data( + {batch_size_, out_height_, out_width_, output_channels_}, place); + float *sum_ptr = + sum.mutable_data({1, 1, 1, output_channels_}, place); + float *sum_of_square_ptr = + sum_of_square.mutable_data({1, 1, 1, output_channels_}, place); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output.dims()); + op::CudnnNormConvolution conv_op(ctx, input_shape, filter_shape, + output_shape, padding_, stride_, + dilation_, group_); + conv_op.Forward(ctx, input_ptr, filter_ptr, output_ptr, sum_ptr, + sum_of_square_ptr); + + TensorCopySync(output, platform::CPUPlace(), cpu_output); + TensorCopySync(sum, platform::CPUPlace(), cpu_sum); + TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square); } - void Run() { - SetUp(); - BaselineForward(); - FusedForward(); + void FusedBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output_grad; + framework::Tensor input_grad; + framework::Tensor filter_grad; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + TensorCopySync(cpu_output_grad_, place, &output_grad); + + T *input_ptr = input.data(); + T *filter_ptr = filter_nhwc.data(); + T *output_grad_ptr = output_grad.data(); + T *input_grad_ptr = input_grad.mutable_data(input.dims(), place); + T *filter_grad_ptr = filter_grad.mutable_data(filter_nhwc.dims(), place); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output_grad.dims()); + op::CudnnNormConvolutionGrad conv_grad_op(ctx, input_shape, filter_shape, + output_shape, padding_, + stride_, dilation_, group_); + conv_grad_op.Backward(ctx, input_ptr, output_grad_ptr, filter_ptr, + input_grad_ptr, filter_grad_ptr); + + TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } - // check forward correctness between baseline and results of normconv. 
- void CheckOut(const T diff, bool is_relative_atol = false) { - std::vector base_output_vec, output_vec; - output_vec.resize(output_size_); - base_output_vec.resize(output_size_); - TensorToVector(base_output_, *ctx_, &base_output_vec); - TensorToVector(output_, *ctx_, &output_vec); - ctx_->Wait(); - - for (int i = 0; i < output_size_; ++i) { - if (is_relative_atol) { - EXPECT_LT( - std::abs((output_vec[i] - base_output_vec[i]) / base_output_vec[i]), - diff); - } else { - EXPECT_LT(std::abs(output_vec[i] - base_output_vec[i]), diff); + bool Support(const platform::CUDADeviceContext &ctx) { + if (ctx.GetComputeCapability() == 70) { + if ((kernel_size_ == 3) || ((kernel_size_ == 1) && (stride_ == 1))) { + return true; + } + } else if (ctx.GetComputeCapability() > 70) { + if ((kernel_size_ == 3) || (kernel_size_ == 1)) { + return true; } } + return false; } private: - int batch_size_, height_, width_, input_channels_, output_channels_; - int kernel_size_, stride_, pad_; - const int dilate_ = 1; + int batch_size_; + int height_; + int width_; + int out_height_; + int out_width_; + int input_channels_; + int output_channels_; + int kernel_size_; + int stride_; + int padding_; + const int dilation_ = 1; const int group_ = 1; - int input_size_, filter_size_, output_size_, param_size_; - framework::Tensor input_, filter_raw_, filter_pro_, output_, base_output_; - framework::Tensor sum_, sum_of_squares_; - std::vector input_vec_, filter_raw_vec_, filter_pro_vec_; + // Forward input + framework::Tensor cpu_input_; + framework::Tensor cpu_filter_nchw_; + framework::Tensor cpu_filter_nhwc_; - platform::CUDAPlace place_ = platform::CUDAPlace(0); - platform::CUDADeviceContext *ctx_ = - static_cast( - platform::DeviceContextPool::Instance().Get(place_)); + // Backward input + framework::Tensor cpu_output_grad_; }; // test for fp16, kernel = 1, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { +TEST(CudnnNormConvFp16, K1S1) { int batch_size = 4; int height = 56; int width = 56; @@ -222,15 +426,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { int output_channels = 32; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 3, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { +TEST(CudnnNormConvFp16, K3S1) { int batch_size = 4; int height = 56; int width = 56; @@ -238,15 +442,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { int output_channels = 32; int kernel_size = 3; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 1, output_channels = input_channels * 4 -TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { +TEST(CudnnNormConvFp16, K1S1O4) { int batch_size = 4; int height = 56; int width = 56; @@ -254,9 +458,25 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { int output_channels = 128; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( + batch_size, height, width, 
input_channels, output_channels, kernel_size, + stride); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); +} + +// test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 +TEST(CudnnNormConvFp16, K1S2O4) { + int batch_size = 4; + int height = 8; + int width = 8; + int input_channels = 32; + int output_channels = 128; + int kernel_size = 1; + int stride = 2; + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3); } diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h new file mode 100644 index 0000000000000..2fdb3635e2e14 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -0,0 +1,292 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct ScaleBiasAddReluArgs { + ScaleBiasAddReluArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::string &act_type, const std::vector &data_shape, + const std::vector ¶m_shape, + const std::vector &bitmask_shape) { + PADDLE_ENFORCE_EQ( + data_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of data_shape is expected to 4. But recieved " + "data_shape's size is %d, data_shape is [%s].", + data_shape.size(), framework::make_ddim(data_shape))); + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to 4. But recieved " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + PADDLE_ENFORCE_EQ( + bitmask_shape.size(), 3U, + platform::errors::InvalidArgument( + "The size of bitmask_shape is expected to 3. 
But recieved " + "bitmask_shape's size is %d, bitmask_shape is [%s].", + bitmask_shape.size(), framework::make_ddim(bitmask_shape))); + + in_desc.set(data_shape, format, dtype); + out_desc.set(data_shape, format, dtype); + equiv_scale_bias_desc.set(param_shape, format, dtype); + scale_bias_mean_var_desc.set(param_shape, format, param_dtype); + bitmask_desc.set(bitmask_shape, format, CUDNN_DATA_INT32); + // set activation desc + cudnnActivationMode_t mode = CUDNN_ACTIVATION_IDENTITY; + if (act_type != "") { + PADDLE_ENFORCE_EQ( + act_type, "relu", + platform::errors::InvalidArgument( + "Only relu activation supported in normalized convolution.")); + mode = CUDNN_ACTIVATION_RELU; + } + double dummy_clip = 0.0; + activation_desc.set(mode, dummy_clip); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor equiv_scale_bias_desc; + platform::TensorDescriptor scale_bias_mean_var_desc; + platform::TensorDescriptor bitmask_desc; + platform::ActivationDescriptor activation_desc; +}; + +template +class CudnnScaleBiasAddRelu { + public: + CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, + const std::string &act_type, bool fused_add, + bool has_shortcut, const std::vector &data_shape, + const std::vector ¶m_shape, + const std::vector &bitmask_shape) + : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK), + bwd_op_(CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM) { + fused_add_ = fused_add; + has_shortcut_ = has_shortcut; + args_.Set(act_type, data_shape, param_shape, bitmask_shape); + } + + ~CudnnScaleBiasAddRelu() {} + + void Forward(const platform::CUDADeviceContext &ctx, T *x_ptr, T *x_scale_ptr, + T *x_bias_ptr, T *out_ptr, int32_t *bitmask_ptr, + T *z_ptr = nullptr, T *z_scale_ptr = nullptr, + T *z_bias_ptr = nullptr) { + ForwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr); + if (has_shortcut_) { + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr); + } else { + if (fused_add_) { + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + } + } + + fwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + + // output ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + fwd_op_.Execute(handle); + }, + fwd_workspace_byte_); + } + + void Backward(const platform::CUDADeviceContext &ctx, T *dy_ptr, T *x_ptr, + float *scale_ptr, float *bias_ptr, float *saved_mean_ptr, + float *saved_invstd_ptr, int32_t *bitmask_ptr, T *dx_ptr, + T *dz_ptr, float *dscale_ptr, float *dbias_ptr, double eps) { + BackwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + bwd_workspace_byte_ = 
bwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, + saved_invstd_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + bwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &bwd_workspace_byte_); + + // output ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DXDATA, dx_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DSCALE, dscale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DBIAS, dbias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, + &eps); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr); + } + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + bwd_op_.Execute(handle); + }, + bwd_workspace_byte_); + } + + private: + void ForwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, CUDNN_PARAM_YDATA_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_) { + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_ZDATA_PLACEHOLDER, CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + } else if (fused_add_) { + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_ZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + if (has_shortcut_ || fused_add_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ZDESC, args_.in_desc.desc()); + } + + // equiv scale/bias desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + if (has_shortcut_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + } + + // output desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + + // bitmask desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + void BackwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + bwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER, + CUDNN_PARAM_DXDATA_PLACEHOLDER, CUDNN_PARAM_BN_SCALE_PLACEHOLDER, + CUDNN_PARAM_BN_BIAS_PLACEHOLDER, CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, CUDNN_PARAM_BN_DBIAS_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_DZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + 
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DXDESC, args_.in_desc.desc()); + if (has_shortcut_ || fused_add_) { + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DZDESC, args_.in_desc.desc()); + } + + // scale/bias/mean/var desc for backward + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.scale_bias_mean_var_desc.desc()); + + // output desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DYDESC, args_.out_desc.desc()); + + // bitmask desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + bool fused_add_ = false; + bool has_shortcut_ = false; + size_t fwd_workspace_byte_; + size_t bwd_workspace_byte_; + ScaleBiasAddReluArgs args_; + CudnnFusionOp fwd_op_; + CudnnFusionOp bwd_op_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h new file mode 100644 index 0000000000000..fcfa405a52f9b --- /dev/null +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/dropout_impl_util.h" +#include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" +#include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/fluid/operators/math/functors.h" + +namespace paddle { +namespace operators { + +/** + * Support two Dropouts in the use senarieo. + * This warpper can be used in FFN op. + * The DropoutParam will be used in the fused_dropout_act_bias, + * fused_residual_dropout_bias(pre_layer_norm=ture) or + * fused_layernorm_residual_dropout_bias(pre_layer_norm=false). +*/ +struct DropoutParam { + uint64_t seed; + float dropout_prob; + bool is_upscale_in_train; + bool is_test; + bool fix_seed; + int increment; + const framework::Tensor* tensor_seed; + int seed_val; + + DropoutParam() { + fix_seed = false; + seed = 0; + is_test = false; + is_upscale_in_train = false; + dropout_prob = 0.5; + tensor_seed = nullptr; + seed_val = 0; + } + + /** + * dropout_index: can be 0, 1, 2. 0 means there is only one dropout, + * 1 and 2 represent two dropout, the parameter name of dropout + * will be "dropout" + dropout_index + param name, such as dropout1_seed, + * dropout1_is_test. 
+ */ + DropoutParam(const framework::ExecutionContext& context, + const int dropout_index) { + std::string pre_fix = "dropout"; + std::string str_index = std::to_string(dropout_index); + if (dropout_index > 0) { + pre_fix = pre_fix + str_index + "_"; + } else { + pre_fix = pre_fix + "_"; + } + dropout_prob = context.Attr(pre_fix + "prob"); + auto& dropout_implementation = + context.Attr(pre_fix + "implementation"); + is_upscale_in_train = (dropout_implementation == "upscale_in_train"); + is_test = context.Attr(pre_fix + "is_test"); + fix_seed = context.Attr(pre_fix + "fix_seed"); + + std::string str_seed = "Dropout"; + if (dropout_index > 0) { + str_seed = str_seed + str_index + "Seed"; + } else { + str_seed = str_seed + "Seed"; + } + tensor_seed = + context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + seed_val = context.Attr(pre_fix + "seed"); + } + + int UpdateSeedAndIncrement(const platform::CUDADeviceContext& ctx, + const int offset) { + uint64_t tmp_increment; + GetSeedDataAndIncrement(ctx, tensor_seed, fix_seed, seed_val, offset, &seed, + &tmp_increment); + increment = static_cast(tmp_increment); + return increment; + } +}; + +template +class FusedDropoutHelper { + private: + int GetIncrement(const platform::CUDADeviceContext& ctx) { + const int VecSize = MAX_CACHE_BYTES / sizeof(T); + const int real_vec_size = cols_ % VecSize == 0 ? VecSize : 1; + auto config = + Get1DBlocksAnd2DGrids(ctx, static_cast(rows_), + static_cast(cols_), real_vec_size); + int increment = ((cols_ - 1) / (config.thread_per_block.x * + config.block_per_grid.x * real_vec_size) + + 1) * + real_vec_size; + increment = dropout_param_.UpdateSeedAndIncrement(ctx, increment); + return increment; + } + + public: + FusedDropoutHelper() {} + FusedDropoutHelper(const platform::CUDADeviceContext& ctx, const int rows, + const int cols, const DropoutParam& dropout_param) { + rows_ = rows; + cols_ = cols; + dropout_param_ = dropout_param; + } + + // out = residual + dropout( src + bias ) + void ResidualDropoutBias(const platform::CUDADeviceContext& ctx, const T* src, + const T* residual, const T* bias, T* out, + MaskType* mask) { + auto increment = GetIncrement(ctx); + LaunchResidualDropoutBias( + rows_, cols_, increment, dropout_param_.seed, + dropout_param_.dropout_prob, dropout_param_.is_test, + dropout_param_.is_upscale_in_train, src, residual, bias, mask, out, + ctx); + } + + void ResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + const T* d_out, const MaskType* mask, T* d_src, + T* d_residual, T* d_bias) { + LaunchResidualDropoutBiasGrad( + d_out, mask, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(cuda_place, d_residual, cuda_place, d_out, + rows_ * cols_ * sizeof(T), ctx.stream()); + } + + // out = dropout(activation(src + bias)) + void DropoutActBias(const platform::CUDADeviceContext& ctx, const T* src, + const T* bias, const std::string& act_method, T* out, + MaskType* mask) { + auto increment = GetIncrement(ctx); + if (act_method == "gelu") { + GeluFunctor gelu; + LaunchDropoutActBias>( + gelu, dropout_param_.seed, rows_, cols_, dropout_param_.increment, + dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, + dropout_param_.is_test, src, bias, out, mask, ctx); + } else if (act_method == "relu") { + math::ReluFunctor relu; + LaunchDropoutActBias>( + relu, dropout_param_.seed, rows_, cols_, increment, + dropout_param_.dropout_prob, 
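A minimal standalone sketch of the per-launch increment computed in GetIncrement above: the number of random values each thread consumes, rounded up to the vector width so the generator offset stays consistent across launches. MAX_CACHE_BYTES and the thread/block counts are passed in as placeholders here rather than taken from Get1DBlocksAnd2DGrids.

#include <cstdint>

// Returns the offset advance for the dropout RNG state: ceil(cols / stride)
// rounded to the effective vector width.
template <typename T>
int64_t ComputeDropoutIncrement(int64_t cols, int64_t threads_per_block,
                                int64_t blocks_per_grid,
                                int64_t max_cache_bytes /* placeholder */) {
  const int64_t vec_size = max_cache_bytes / static_cast<int64_t>(sizeof(T));
  // Fall back to scalar accesses when the row width is not a multiple of the
  // vector width.
  const int64_t real_vec_size = (cols % vec_size == 0) ? vec_size : 1;
  const int64_t stride = threads_per_block * blocks_per_grid * real_vec_size;
  return ((cols - 1) / stride + 1) * real_vec_size;
}

// Example: float, 16-byte vectors, cols = 1024, one block of 256 threads
// gives vec_size = 4, stride = 1024, increment = 4.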
dropout_param_.is_upscale_in_train, + dropout_param_.is_test, src, bias, out, mask, ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently only supports gelu or relu activation functions!")); + } + } + + void DropoutActBiasGrad(const platform::CUDADeviceContext& ctx, const T* dout, + const T* src, const T* bias, const MaskType* mask, + T* d_src, T* d_bias, const std::string& act_method) { + if (act_method == "gelu") { + GeluGradFunctor gelu_grad; + LaunchDropoutActBiasGrad>( + gelu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + } else if (act_method == "relu") { + math::ReluGradFunctor relu_grad; + LaunchDropoutActBiasGrad>( + relu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently only supports gelu or relu activation functions!")); + } + } + + protected: + int rows_; + int cols_; + DropoutParam dropout_param_; +}; + +template +class FusedDropoutLayerNormHelper : public FusedDropoutHelper { + public: + FusedDropoutLayerNormHelper() {} + FusedDropoutLayerNormHelper(const int rows, const int cols, + const float epsilon) { + using U = LayerNormParamType; + this->rows_ = rows; + this->cols_ = cols; + epsilon_ = epsilon; + } + + FusedDropoutLayerNormHelper(const platform::CUDADeviceContext& ctx, + const int rows, const int cols, + const DropoutParam& dropout_param, + const float epsilon) + : FusedDropoutHelper(ctx, rows, cols, dropout_param) { + using U = LayerNormParamType; + epsilon_ = epsilon; + } + + // call layer_norm + void LayerNorm(const platform::CUDADeviceContext& ctx, const T* src, + const LayerNormParamType* gamma, + const LayerNormParamType* beta, T* out, + LayerNormParamType* mean, LayerNormParamType* variance) { + using U = LayerNormParamType; + switch (GetDesiredBlockDim(this->cols_)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForward< + T, U, kBlockDim><<rows_, kBlockDim, 0, ctx.stream()>>>( + src, gamma, beta, out, mean, variance, epsilon_, this->cols_)); + } + } + + void LayerNormGrad(const platform::CUDADeviceContext& ctx, const T* dout, + const T* src, const LayerNormParamType* gamma, + const LayerNormParamType* mean, + const LayerNormParamType* variance, T* d_src, + LayerNormParamType* d_scale, + LayerNormParamType* d_bias) { + using U = LayerNormParamType; + LayerNormBackward(src, dout, gamma, mean, variance, d_src, d_scale, + d_bias, epsilon_, this->rows_, this->cols_, ctx); + } + + // out = layernorm(residual + dropout(src + bias)) + void LayernormResidualDropoutBias( + const platform::CUDADeviceContext& ctx, const T* src, const T* residual, + const T* bias, const LayerNormParamType* gamma, + const LayerNormParamType* beta, T* dropout_out, MaskType* mask, T* out, + LayerNormParamType* mean, LayerNormParamType* variance) { + using U = LayerNormParamType; + int vec_size = MAX_CACHE_BYTES / sizeof(T); + if (this->cols_ % vec_size != 0) { + vec_size = 1; + } + int threads = GetDesiredBlockDim(this->cols_ / vec_size); + int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size; + increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment); + LaunchLayernormResidualDropoutBias( + this->rows_, this->cols_, increment, this->dropout_param_.seed, + this->dropout_param_.dropout_prob, epsilon_, + this->dropout_param_.is_upscale_in_train, this->dropout_param_.is_test, + src, residual, bias, gamma, 
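For reference, a scalar CPU version of the composition the fused kernel implements, out = layernorm(residual + dropout(src + bias)). This sketch assumes upscale-in-train dropout and takes the mask as an input instead of generating it from the RNG state.

#include <cmath>
#include <cstdint>
#include <vector>

// dropout_out = residual + mask * (src + bias) / (1 - p)
// out         = gamma * (dropout_out - mean) / sqrt(var + eps) + beta
void LayerNormResidualDropoutBiasRef(
    const std::vector<float>& src, const std::vector<float>& residual,
    const std::vector<float>& bias, const std::vector<uint8_t>& mask,
    const std::vector<float>& gamma, const std::vector<float>& beta,
    float dropout_prob, float epsilon, int rows, int cols,
    std::vector<float>* out) {
  const float factor = 1.0f / (1.0f - dropout_prob);
  out->assign(static_cast<size_t>(rows) * cols, 0.0f);
  for (int r = 0; r < rows; ++r) {
    std::vector<float> tmp(cols);
    float mean = 0.0f;
    for (int c = 0; c < cols; ++c) {
      const int idx = r * cols + c;
      tmp[c] = residual[idx] + mask[idx] * (src[idx] + bias[c]) * factor;
      mean += tmp[c];
    }
    mean /= cols;
    float var = 0.0f;
    for (int c = 0; c < cols; ++c) var += (tmp[c] - mean) * (tmp[c] - mean);
    var /= cols;
    const float inv_std = 1.0f / std::sqrt(var + epsilon);
    for (int c = 0; c < cols; ++c) {
      (*out)[r * cols + c] = gamma[c] * (tmp[c] - mean) * inv_std + beta[c];
    }
  }
}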
beta, mask, dropout_out, out, mean, + variance, ctx); + } + + void LayernormResidualDropoutBiasGrad( + const platform::CUDADeviceContext& ctx, const T* d_out, + const T* layernorm_src, const MaskType* mask, + const LayerNormParamType* gamma, const LayerNormParamType* mean, + const LayerNormParamType* variance, T* d_layernorm_src, + LayerNormParamType* d_scale, LayerNormParamType* d_layernorm_bias, + T* d_dropout_src, T* d_bias, T* d_residual) { + using U = LayerNormParamType; + LayerNormBackward(layernorm_src, d_out, gamma, mean, variance, + d_layernorm_src, d_scale, d_layernorm_bias, + epsilon_, this->rows_, this->cols_, ctx); + this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src, + d_residual, d_bias); + } + + protected: + float epsilon_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc new file mode 100644 index 0000000000000..4ef8320cbdecd --- /dev/null +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/group_norm_op.h" +#include +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct GroupNormFunction { + public: + explicit GroupNormFunction(const framework::ExecutionContext& ctx) + : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + void ReduceMean(const Tensor* x, Tensor* y, const std::vector& dim, + bool keep_dims = true) { + // y should be init first + const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*y}, + {{"axes", dim}, {"keep_dims", keep_dims}}); + runner.Run(stream); + } + void ReduceSum(const Tensor* x, Tensor* y, const std::vector& dim, + bool keep_dims = true) { + // y should be init first + const auto& runner = NpuOpRunner("ReduceSumD", {*x}, {*y}, + {{"axes", dim}, {"keep_dims", keep_dims}}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Div(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Transpose(const Tensor* x, Tensor* y, const std::vector& 
axis) { + // y should be init first + const auto& runner = + NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); + runner.Run(stream); + } + void Sqrt(const Tensor* x, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + Tensor ReduceMeanToNG(const Tensor* x, const DataLayout& data_layout, + const int64_t N, const int64_t C, const int64_t H, + const int64_t W, const int G) { + Tensor y(x->type()); + // y.mutable_data( {N,G,1}, place ); + if (data_layout == DataLayout::kNCHW) { + y.mutable_data({N, G, 1}, place); + // shape of x is [N, G, C*H*W/G] + this->ReduceMean(x, &y, std::vector{2}); + } else { + y.mutable_data({N, 1, G}, place); + // shape of x is [N, C*H*W/G, G] + Tensor x_trans(x->type()); + x_trans.mutable_data({N, G, C * H * W / G}, place); + this->Transpose(x, &x_trans, std::vector{0, 2, 1}); + this->ReduceMean(&x_trans, &y, std::vector{2}); + } + return y; + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +class GroupNormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); + + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); + const auto groups = ctx.Attr("groups"); + + auto place = ctx.GetPlace(); + Tensor xnorm(x->type()); + xnorm.mutable_data(x->dims(), place); + GroupNormFunction F(ctx); + if (data_layout != DataLayout::kNCHW) { + xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); + F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); + } else { + TensorCopy(*x, platform::NPUPlace(), &xnorm); + } + auto N = xnorm.dims()[0]; + auto C = xnorm.dims()[1]; + auto H = xnorm.dims()[2]; + auto W = xnorm.dims()[3]; + xnorm.Resize({N * groups, C * H * W / groups}); + std::vector axis = {1}; + auto reduce_dim = mean->dims(); + + mean->mutable_data({N * groups, 1}, place); + var->mutable_data({N * groups, 1}, place); + y->mutable_data(place); + F.ReduceMean(&xnorm, mean, axis); + + F.Sub(&xnorm, mean, &xnorm); + Tensor sqr(x->type()); + sqr.mutable_data(xnorm.dims(), place); + + F.Mul(&xnorm, &xnorm, &sqr); + F.ReduceMean(&sqr, var, axis); + Tensor std(x->type()); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + y->Resize(xnorm.dims()); + F.Div(&xnorm, &std, y); + y->Resize({N, C, H, W}); + if (scale) { + Tensor scale_t(scale->type()); + scale_t.ShareDataWith(*scale); + scale_t.Resize({C, 1, 1}); + F.Mul(y, &scale_t, y); + } + if (bias) { + Tensor bias_t(bias->type()); + bias_t.ShareDataWith(*bias); + bias_t.Resize({C, 1, 1}); + F.Add(y, &bias_t, y); + } + if (data_layout != DataLayout::kNCHW) { + F.Transpose(y, y, std::vector{0, 2, 3, 1}); + y->Resize({x->dims()}); + } + mean->Resize(reduce_dim); + var->Resize(reduce_dim); + } +}; + +template +class GroupNormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + 
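A CPU reference of the forward math the NPU kernel expresses through ReduceMeanD/Sub/Mul/Sqrt/Div, for NCHW input: the data is viewed as [N * groups, C * H * W / groups], normalized per group, then the per-channel scale and bias are applied. This is a standalone illustration, not the kernel itself.

#include <cmath>
#include <vector>

// x, y hold N*C*H*W floats in NCHW order; scale and bias have C entries.
void GroupNormNCHWRef(const std::vector<float>& x,
                      const std::vector<float>& scale,
                      const std::vector<float>& bias, int N, int C, int H,
                      int W, int groups, float epsilon,
                      std::vector<float>* y) {
  const int group_size = C / groups * H * W;  // elements in one (n, g) group
  y->assign(x.size(), 0.0f);
  for (int n = 0; n < N; ++n) {
    for (int g = 0; g < groups; ++g) {
      const int base = n * C * H * W + g * group_size;
      float mean = 0.0f, var = 0.0f;
      for (int i = 0; i < group_size; ++i) mean += x[base + i];
      mean /= group_size;
      for (int i = 0; i < group_size; ++i) {
        const float d = x[base + i] - mean;
        var += d * d;
      }
      var /= group_size;
      const float inv_std = 1.0f / std::sqrt(var + epsilon);
      for (int i = 0; i < group_size; ++i) {
        const int c = g * (C / groups) + i / (H * W);
        (*y)[base + i] = scale[c] * (x[base + i] - mean) * inv_std + bias[c];
      }
    }
  }
}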
const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* y = ctx.Input("Y"); + auto* var = ctx.Input("Variance"); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + const auto G = ctx.Attr("groups"); + + // init output + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + + GroupNormFunction F(ctx); + auto place = ctx.GetPlace(); + auto _type = y->type(); + + Tensor xnorm(_type); + xnorm.mutable_data(y->dims(), place); + Tensor scale_share(_type); + scale_share.ShareDataWith(*scale); + Tensor bias_share(_type); + bias_share.ShareDataWith(*bias); + + int64_t N = y->dims()[0]; + int64_t C, H, W; + framework::DDim scale_bias_dim; + if (data_layout == DataLayout::kNCHW) { + C = y->dims()[1]; + H = y->dims()[2]; + W = y->dims()[3]; + scale_bias_dim = framework::make_ddim({C, 1, 1}); + } else { + C = y->dims()[3]; + H = y->dims()[1]; + W = y->dims()[2]; + scale_bias_dim = framework::make_ddim({1, 1, C}); + } + scale_share.Resize(scale_bias_dim); + bias_share.Resize(scale_bias_dim); + F.Sub(y, &bias_share, &xnorm); + F.DivNoNan(&xnorm, &scale_share, &xnorm); + + if (d_bias) { + d_bias->mutable_data(place); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); + } else { + F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); + } + } + if (d_scale) { + d_scale->mutable_data(place); + Tensor dy_xnorm(_type); + dy_xnorm.mutable_data(d_y->dims(), place); + F.Mul(d_y, &xnorm, &dy_xnorm); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); + } else { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); + } + } + + // std = Sqrt(var+epsilon), init shape = [ N, G ] + Tensor std(_type); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + // d_xnorm_std = dy_proc * scale / std + Tensor d_xnorm_std(_type); + d_xnorm_std.mutable_data(y->dims(), place); + F.Mul(d_y, &scale_share, &d_xnorm_std); + if (data_layout == DataLayout::kNCHW) { + xnorm.Resize({N, G, C * H * W / G}); + d_xnorm_std.Resize({N, G, C * H * W / G}); + std.Resize({N, G, 1}); + } else { + xnorm.Resize({N, C * H * W / G, G}); + d_xnorm_std.Resize({N, C * H * W / G, G}); + std.Resize({N, 1, G}); + } + F.Div(&d_xnorm_std, &std, &d_xnorm_std); + + // d_x = d_xnorm_std + // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm + // - Mean ( d_xnorm_std, axis=1, keepdim=True ) + d_x->mutable_data(place); + d_x->Resize(xnorm.dims()); + F.Mul(&d_xnorm_std, &xnorm, d_x); + Tensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); + F.Mul(&dx1, &xnorm, d_x); + + Tensor dx2 = F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); + + F.Sub(&d_xnorm_std, d_x, d_x); + F.Sub(d_x, &dx2, d_x); + + d_x->Resize(y->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(group_norm, ops::GroupNormNPUKernel, + ops::GroupNormNPUKernel); +REGISTER_OP_NPU_KERNEL(group_norm_grad, ops::GroupNormGradNPUKernel, + ops::GroupNormGradNPUKernel); diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index 
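The chain of Mul/ReduceMeanToNG/Sub calls in the gradient kernel implements the formula written in its comment, d_x = d_xnorm_std - Mean(d_xnorm_std * x_norm) * x_norm - Mean(d_xnorm_std), with the mean taken within each group. A compact per-group reference, assuming x_norm and d_xnorm_std have already been formed as in the kernel:

#include <vector>

// xnorm, d_xnorm_std, dx are viewed as [N * G, D], D = C * H * W / G.
void GroupNormInputGradRef(const std::vector<float>& xnorm,
                           const std::vector<float>& d_xnorm_std, int NG,
                           int D, std::vector<float>* dx) {
  dx->assign(xnorm.size(), 0.0f);
  for (int ng = 0; ng < NG; ++ng) {
    const int base = ng * D;
    float mean_prod = 0.0f, mean_grad = 0.0f;
    for (int i = 0; i < D; ++i) {
      mean_prod += d_xnorm_std[base + i] * xnorm[base + i];
      mean_grad += d_xnorm_std[base + i];
    }
    mean_prod /= D;
    mean_grad /= D;
    for (int i = 0; i < D; ++i) {
      (*dx)[base + i] =
          d_xnorm_std[base + i] - mean_prod * xnorm[base + i] - mean_grad;
    }
  }
}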
b624d03cc8555..825229282f3da 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -99,10 +99,11 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { transed_out_dims[i] = out_dims[in_trans_perm[i]]; } transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - framework::NPUAttributeMap in_trans_attr = {{"perm", in_trans_perm}}; - - const auto& in_trans_runner = NpuOpRunner( - "TransposeD", {*out_grad}, {transed_out_grad}, in_trans_attr); + NpuOpRunner in_trans_runner; + in_trans_runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(in_trans_perm)) + .AddOutput(transed_out_grad); in_trans_runner.Run(stream); Tensor sum_out; @@ -133,10 +134,12 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { for (int i = 1 + dim; i < x_dims.size(); ++i) { out_trans_perm.push_back(i); } - framework::NPUAttributeMap out_trans_attr = {{"perm", out_trans_perm}}; x_grad->mutable_data(ctx.GetPlace()); - const auto& out_trans_runner = - NpuOpRunner("TransposeD", {sum_out}, {*x_grad}, out_trans_attr); + NpuOpRunner out_trans_runner; + out_trans_runner.SetType("Transpose") + .AddInput(sum_out) + .AddInput(std::move(out_trans_perm)) + .AddOutput(*x_grad); out_trans_runner.Run(stream); } } diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6f8b89ce64523..fe9228135606d 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -1198,7 +1198,12 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpFw<<<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); @@ -1606,9 +1611,11 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; bool optimize_flag = false; +#ifndef __HIPCC__ optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) ? true : ((in_h == 1 && in_w == 1) ? true : false); +#endif if (optimize_flag & is_nchw) { KeBilinearInterpBwShareMemory< @@ -1623,7 +1630,12 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ratio_h, ratio_w, align_type_value, is_nchw); } } else if ("bicubic" == interp_method) { - KeBicubicInterpBw<<<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h new file mode 100644 index 0000000000000..fcfcdc28b1f00 --- /dev/null +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -0,0 +1,230 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +namespace paddle { +namespace operators { +namespace kernel_primitives { +namespace details { + +static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) { + return ::Eigen::numext::exp(x); +} + +static __device__ __forceinline__ float Exp(float x) { return expf(x); } + +static __device__ __forceinline__ double Exp(double x) { return exp(x); } + +static __device__ __forceinline__ platform::float16 Log(platform::float16 x) { + return ::Eigen::numext::log(x); +} + +static __device__ __forceinline__ float Log(float x) { return logf(x); } + +static __device__ __forceinline__ double Log(double x) { return log(x); } + +} // namespace details + +/******************************** Unary Functor *******************************/ + +/** + * @brief Default unary exp functor + */ +template +struct ExpFunctor { + HOSTDEVICE inline ExpFunctor() {} + + HOSTDEVICE explicit inline ExpFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(details::Exp(x)); + } +}; + +/** + * @brief Default unary identity functor + */ +template +struct IdentityFunctor { + HOSTDEVICE inline IdentityFunctor() {} + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } +}; + +/** + * @brief Default unary div functor. Divide by a constant + */ +template +struct DivideFunctor { + HOSTDEVICE inline DivideFunctor() { n_inv = static_cast(1.0f); } + + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((Tx)(1.0 / n)) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x * n_inv); + } + + private: + Tx n_inv; +}; + +/** + * @brief Default unary square functor + */ +template +struct SquareFunctor { + HOSTDEVICE inline SquareFunctor() {} + + HOSTDEVICE explicit inline SquareFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x) * static_cast(x); + } +}; + +/****************************** Binary Functor ********************************/ + +/** + * @brief Default binary min functor + */ +template +struct MinFunctor { + inline T initial() { return static_cast(std::numeric_limits::max()); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b < a) ? b : a; + } +}; + +/** + * @brief Default binary max functor + */ +template +struct MaxFunctor { + inline T initial() { + return static_cast(std::numeric_limits::lowest()); + } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b > a) ? 
b : a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct AddFunctor { + inline T initial() { return static_cast(0.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b + a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct MulFunctor { + inline T initial() { return static_cast(1.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b * a; + } +}; + +/** + * @brief Default binary logic or functor + */ +template +struct LogicalOrFunctor { + inline T initial() { return static_cast(false); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b || a; + } +}; + +/** + * @brief Default binary logic and functor + */ +template +struct LogicalAndFunctor { + inline T initial() { return static_cast(true); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b && a; + } +}; + +/** + * @brief Default binary sub functor + */ +template +struct SubFunctor { + inline T initial() { return static_cast(0.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; + +/** + * @brief Default binary div functor + */ +template +struct DivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivFunctor::value>::type> { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. Please check the input value.")); + return a / b; + } +}; + +/** + * @brief Default binary floor divide functor + */ +template +struct FloorDivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. 
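Each functor in this header pairs an initial() identity value with a binary combine, so a reduction can be written once and specialized by functor. A host-side sketch of that contract using std::accumulate; the device reduction kernels are assumed to rely on the same two pieces.

#include <limits>
#include <numeric>
#include <vector>

template <typename T>
struct MaxFunctorRef {
  T initial() const { return std::numeric_limits<T>::lowest(); }
  T operator()(const T& a, const T& b) const { return b > a ? b : a; }
};

template <typename T, typename Functor>
T Reduce(const std::vector<T>& data, Functor op) {
  // The identity from initial() keeps the reduction well defined for empty
  // inputs and lets partial results be combined in any order.
  return std::accumulate(data.begin(), data.end(), op.initial(), op);
}

// Example: Reduce<float>({1.f, 7.f, 3.f}, MaxFunctorRef<float>{}) == 7.f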
Please check the input value.")); + return static_cast(std::trunc(a / b)); + } +}; + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 45ee4fd738174..9a4f8bb026b9d 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -16,6 +16,7 @@ #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" +#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" #include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" namespace paddle { diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 93c37ae425640..415d0c6dd8e0c 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -70,6 +70,46 @@ void compute_solve_eigen(const DeviceContext& context, } } +// only used for complex input +template +void SolveLinearSystem(T* matrix_data, T* rhs_data, T* out_data, int order, + int rhs_cols, int batch) { + using Treal = typename Eigen::NumTraits::Real; + + // cast paddle::complex into std::complex + std::complex* matrix_data_ = + reinterpret_cast*>(matrix_data); + std::complex* rhs_data_ = + reinterpret_cast*>(rhs_data); + std::complex* out_data_ = + reinterpret_cast*>(out_data); + + using Matrix = Eigen::Matrix, Eigen::Dynamic, + Eigen::Dynamic, Eigen::RowMajor>; + using InputMatrixMap = Eigen::Map; + using OutputMatrixMap = Eigen::Map; + + for (int i = 0; i < batch; ++i) { + auto input_matrix = + InputMatrixMap(matrix_data_ + i * order * order, order, order); + auto input_rhs = + InputMatrixMap(rhs_data_ + i * order * rhs_cols, order, rhs_cols); + auto output = + OutputMatrixMap(out_data_ + i * order * rhs_cols, order, rhs_cols); + + Eigen::PartialPivLU lu_decomposition(order); + lu_decomposition.compute(input_matrix); + + const Treal min_abs_piv = + lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_GT(min_abs_piv, Treal(0), + platform::errors::InvalidArgument( + "Something's wrong with SolveLinearSystem. ")); + + output = lu_decomposition.solve(input_rhs); + } +} + template class MatrixSolveFunctor { public: diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index b23b408e9c59a..6d7e8f3478c84 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,166 +21,387 @@ limitations under the License. 
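SolveLinearSystem above maps the batched complex buffers onto Eigen matrices and factorizes each one with partial-pivoting LU, rejecting near-singular inputs via the smallest pivot. A single-matrix standalone sketch of the same Eigen calls, in double precision for brevity:

#include <Eigen/Dense>
#include <iostream>

int main() {
  Eigen::MatrixXd A(2, 2);
  A << 4.0, 1.0,
       2.0, 3.0;
  Eigen::VectorXd b(2);
  b << 9.0, 13.0;

  Eigen::PartialPivLU<Eigen::MatrixXd> lu(A);
  // The smallest |pivot| indicates how close A is to singular; the operator
  // turns this into an InvalidArgument error rather than returning NaNs.
  const double min_abs_piv = lu.matrixLU().diagonal().cwiseAbs().minCoeff();
  if (min_abs_piv <= 0.0) {
    std::cerr << "matrix is singular\n";
    return 1;
  }
  Eigen::VectorXd x = lu.solve(b);  // x is approximately [1.4, 3.4]
  std::cout << x.transpose() << "\n";
  return 0;
}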
*/ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class MatMulV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("trans_x"); - bool transpose_y = ctx.Attr("trans_y"); - - if (x->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "MatMul", {*x, *y}, {*out}, - {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); + + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + auto stream = ctx.template device_context().stream(); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ( + X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers must be equal to Y's numbers," + "when X/Y's dims =1. 
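ReduceDims above recovers a gradient of the original, unbroadcast shape by summing over every axis that broadcasting added or expanded. A standalone sketch of the axis selection it performs:

#include <cstdint>
#include <vector>

// dims: original shape, brd_dims: shape after broadcasting (same or higher
// rank). Returns the axes of brd_dims to reduce (keep_dims = false) to get
// back to dims.
std::vector<int64_t> AxesToReduce(const std::vector<int64_t>& dims,
                                  const std::vector<int64_t>& brd_dims) {
  std::vector<int64_t> axes;
  const int64_t size = static_cast<int64_t>(brd_dims.size());
  const int64_t diff = size - static_cast<int64_t>(dims.size());
  for (int64_t i = 0; i < size; ++i) {
    if (i < diff || brd_dims[i] > dims[i - diff]) {
      axes.push_back(i);  // leading axes and axes expanded from size 1
    }
  }
  return axes;
}

// Example: dims = {4, 1, 5}, brd_dims = {2, 4, 3, 5}  ->  axes = {0, 2}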
But received X has [%d] elements," + "received Y has [%d] elements", + X->numel(), Y->numel())); + Out->Resize({1}); + Out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); + const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); runner.Run(stream); + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + y_ndim = 2; + out_ndim += 1; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (trans_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); } }; -template +template class MatMulV2GradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = 
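In the batched case the shapes handed to BroadcastTo are built by taking every batch dimension from the output and keeping the operand's own trailing two dimensions. A sketch with explicit types (the kernel stores the dims as int64_t):

#include <algorithm>
#include <cstdint>
#include <vector>

// out_dims has rank >= 3; operand_dims keeps its [.., M, K] (or [.., K, N])
// tail while all batch dimensions come from out_dims.
std::vector<int64_t> BroadcastDimsForMatMul(
    const std::vector<int64_t>& operand_dims,
    const std::vector<int64_t>& out_dims) {
  std::vector<int64_t> brd(out_dims.size(), 1);
  std::copy(out_dims.begin(), out_dims.end() - 2, brd.begin());
  std::copy(operand_dims.end() - 2, operand_dims.end(), brd.end() - 2);
  return brd;
}

// Example: X [3, 1, 2, 4] with Out [3, 5, 2, 6] broadcasts X to [3, 5, 2, 4].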
ctx.Output(framework::GradVarName("Y")); - bool transpose_y = ctx.Attr("trans_y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); - runner_dy.Run(stream); + if (dX) { + dX->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); + runner_dx.Run(stream); + } + if (dY) { + dY->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); + runner_dy.Run(stream); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (trans_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); 
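The trans_x/trans_y bookkeeping in these branches follows from Out = op(X) * op(Y): dX = dOut * op(Y)^T, or op(Y) * dOut^T when X was transposed, and dY = op(X)^T * dOut, or dOut^T * op(X) when Y was transposed. A small helper that states which operands and transpose flags feed MatMul2D; the names are illustrative only.

enum class Operand { kX, kY, kDOut };

struct MatMulPlan {
  Operand lhs, rhs;
  bool trans_lhs, trans_rhs;
};

// Gradient w.r.t. X for Out = op(X) * op(Y), matching the branches above.
MatMulPlan PlanForDX(bool trans_x, bool trans_y) {
  if (trans_x) {
    return {Operand::kY, Operand::kDOut, trans_y, true};  // dX = op(Y) * dOut^T
  }
  return {Operand::kDOut, Operand::kY, false, !trans_y};  // dX = dOut * op(Y)^T
}

// Gradient w.r.t. Y.
MatMulPlan PlanForDY(bool trans_x, bool trans_y) {
  if (trans_y) {
    return {Operand::kDOut, Operand::kX, true, trans_x};  // dY = dOut^T * op(X)
  }
  return {Operand::kX, Operand::kDOut, !trans_x, false};  // dY = op(X)^T * dOut
}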
+ } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - runner_dx.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() == 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", 
{x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, + true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !trans_y); } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, + false); + } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2, ops::MatMulV2NPUKernel, + ops::MatMulV2NPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2_grad, ops::MatMulV2GradNPUKernel, + ops::MatMulV2GradNPUKernel); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index d992890adeec3..603a70458b0ce 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -257,7 +257,6 @@ namespace ops = paddle::operators; ops::grad_functor>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ @@ -267,6 +266,8 @@ namespace ops = paddle::operators; __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, + ReluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index ed265edf003e0..db1127b055c31 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 57a56776736ff..4cc96a48bd26f 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -23,6 +23,7 @@ namespace operators { using framework::DataLayout; using framework::Tensor; +using framework::LoDTensor; using mkldnn::memory; using mkldnn::primitive; using mkldnn::concat; @@ -149,6 +150,72 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_format(platform::GetMKLDNNFormat(*dst_mem)); } }; + +template +class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + + const auto x = ctx.MultiInput("X"); + const auto* dout = ctx.Input(framework::GradVarName("Out")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + + int axis = ctx.Attr("axis"); + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + } + + auto dout_vec_dims = framework::vectorize(dout->dims()); + + axis = ComputeAxis(axis, dout_vec_dims.size()); + + std::vector offset(dout_vec_dims.size(), 0); + + mkldnn::memory::data_type dout_type = + framework::ToMKLDNNDataType(dout->type()); + platform::ReorderMKLDNNHandler reorder_handler(dout_vec_dims, dout->type(), + dout_type, onednn_engine); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + for (size_t i = 0; i < dx.size(); ++i) { + if (out_var_names[i] != framework::kEmptyVarName && + dx[i]->numel() != 0UL) { + auto dx_vec_dims = framework::vectorize(dx[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + dx_vec_dims, offset, reorder_src_memory_p); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dx[i], dx_vec_dims, dout->format(), ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += dx[i]->dims()[axis]; + + dx[i]->set_layout(framework::DataLayout::kMKLDNN); + dx[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + } + astream.wait(); + } +}; + } // namespace operators } // namespace paddle @@ -159,3 +226,7 @@ REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel); + +REGISTER_OP_KERNEL(concat_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConcatGradMKLDNNOpKernel, + ops::ConcatGradMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 1b69dd7ea00c7..cce835e6bc035 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ 
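The concat gradient is a split: each dX[i] is a sub-memory of dOut taken at a running offset along the concat axis, which is what the reorder loop above acquires. A sketch of only the offset bookkeeping, with shapes as plain vectors:

#include <cstdint>
#include <vector>

struct Slice {
  std::vector<int64_t> dims;    // shape of this dX[i]
  std::vector<int64_t> offset;  // where it starts inside dOut
};

// dx_dims[i] matches dout's shape except along `axis`, whose sizes sum to
// dout's extent on that axis.
std::vector<Slice> SplitOffsets(
    const std::vector<std::vector<int64_t>>& dx_dims, size_t rank, int axis) {
  std::vector<Slice> slices;
  std::vector<int64_t> offset(rank, 0);
  for (const auto& dims : dx_dims) {
    slices.push_back({dims, offset});
    offset[axis] += dims[axis];  // the next slice starts where this one ends
  }
  return slices;
}

// Example: dOut is [4, 10], axis = 1, input widths 3 and 7
//          -> slices at offsets {0, 0} and {0, 3}.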
b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,27 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_layout_transform.h" +#include + #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { - -using framework::DataLayout; -using mkldnn::memory; -using mkldnn::primitive; -using mkldnn::reorder; -using mkldnn::stream; -using platform::GetMKLDNNFormat; -using platform::to_void_cast; +namespace { inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, const int groups, @@ -78,7 +67,7 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights> { public: - ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx, + ConvMKLDNNHandlerT(const framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, @@ -92,19 +81,19 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( - input->layout(), DataLayout::kMKLDNN, + input->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); + framework::DataLayout::kMKLDNN, input->layout())); PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Filter tensor")); @@ -137,10 +126,10 @@ class ConvMKLDNNHandlerT if (bias) { PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, + bias->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); + framework::DataLayout::kMKLDNN, bias->layout())); PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Bias tensor.")); @@ -188,12 +177,12 @@ class ConvMKLDNNHandlerT std::transform(dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; }); - const auto src_tz = paddle::framework::vectorize(input->dims()); + const auto src_tz = framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto weights_tz = framework::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - const auto dst_tz = paddle::framework::vectorize(output->dims()); + const auto dst_tz = 
framework::vectorize(output->dims()); const mkldnn::memory::dims stride_dims = strides; const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); @@ -204,29 +193,49 @@ class ConvMKLDNNHandlerT * the memory format preferred for best performance */ auto chosen_memory_format = MKLDNNMemoryFormat::any; - auto data_type = mkldnn::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || std::is_same::value) data_type = mkldnn::memory::data_type::bf16; - const auto src_md = - platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); - const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, - MKLDNNMemoryFormat::any); + mkldnn::memory::desc src_md, weights_md; + if (platform::is_int8()) { + src_md = platform::MKLDNNMemDesc( + src_tz, framework::ToMKLDNNDataType(input->type()), + chosen_memory_format); + weights_md = platform::MKLDNNMemDesc( + weights_tz, mkldnn::memory::data_type::s8, chosen_memory_format); + } else { + src_md = + platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, + MKLDNNMemoryFormat::any); + } + const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; + float sum_scale = 1.0f; + std::vector output_shift_scale; + if (platform::is_int8()) + std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + const mkldnn::primitive_attr conv_attr = CreatePostOps( - fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn); + fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, + output_shift_scale, sum_scale); // for INT8 only! if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = - platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc(bias_tz, data_type, + MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, @@ -255,28 +264,28 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isBwdCached()) { PADDLE_ENFORCE_EQ( - in->layout(), DataLayout::kMKLDNN, + in->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, in->layout())); + framework::DataLayout::kMKLDNN, in->layout())); PADDLE_ENFORCE_NE(in->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Input tensor.")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Filter tensor.")); PADDLE_ENFORCE_EQ( - out_grad->layout(), DataLayout::kMKLDNN, + out_grad->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The output_grad tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, out_grad->layout())); + framework::DataLayout::kMKLDNN, out_grad->layout())); 
PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for output_grad tensor")); @@ -296,28 +305,25 @@ class ConvMKLDNNHandlerT std::vector dilations(begin(dilations_temp), end(dilations_temp)); - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - int groups = ctx.Attr("groups"); - auto input_dims = in->dims(); auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); auto filter_dims = filter->dims(); auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + std::string padding_algorithm = + ctx.Attr("padding_algorithm"); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); auto src_tz = framework::vectorize(in->dims()); auto weights_tz = framework::vectorize(filter->dims()); + int groups = ctx.Attr("groups"); int g = std::max(groups, 1); platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(out_grad->dims()); + auto dst_tz = framework::vectorize(out_grad->dims()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose @@ -349,8 +355,14 @@ class ConvMKLDNNHandlerT mkldnn::primitive_attr conv_attr; if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, mkldnn::prop_kind::forward_training, @@ -377,6 +389,71 @@ class ConvMKLDNNHandlerT } } + std::tuple> get_int8_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_in_data = ctx.Attr("Scale_in"); + const auto& scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + bool is_multi_channel = scale_weights_data.size() > 1; + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + int count = + is_multi_channel + ? (groups > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + return std::make_tuple(sum_scale, output_shift_scale); + } + + std::tuple> get_int8_bias_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const auto& scale_in_data = ctx.Attr("Scale_in"); + + bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + + return std::make_tuple(mask_reorder, scale_bias_data); + } + mkldnn::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, @@ -433,7 +510,7 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->bwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_d_p", false); + platform::to_void_cast(filter_data), "@weights_mem_d_p", false); } std::shared_ptr AcquireSrcMemoryWithReorder( @@ -480,11 +557,11 @@ class ConvMKLDNNHandlerT framework::vectorize(in_mem->dims()), platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, to_void_cast(in_mem_data), key_mem); + user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); - user_mem_p->set_data_handle(to_void_cast(in_mem_data)); + user_mem_p->set_data_handle(platform::to_void_cast(in_mem_data)); if (user_mem_p != target_mem_p) { this->AcquireReorder(user_mem_p, target_mem_p, key_mem); } @@ -494,7 +571,8 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const framework::Tensor* filter, const int groups, const bool is_conv3d, - const bool is_test) { + const bool is_test, const std::vector& scale_data = {1.0f}, + int mask = 0) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); @@ -511,12 +589,14 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_p", is_test); + platform::to_void_cast(filter_data), "@weights_mem_p", is_test, {}, + scale_data, mask); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, const bool is_test) { + const framework::Tensor* bias, const bool is_test, + const std::vector& scale_data = {1.0f}, int mask = 0) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); if (is_test && 
bias_mem_p) { return bias_mem_p; @@ -527,8 +607,9 @@ class ConvMKLDNNHandlerT MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( - user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), - "@bias_mem_p", is_test); + user_bias_md, this->fwd_pd_->bias_desc(), + platform::to_void_cast(bias_data), "@bias_mem_p", is_test, {}, + scale_data, mask); } } @@ -536,8 +617,8 @@ class ConvMKLDNNHandlerT const framework::Tensor* residual_param) { void* residual_data = residual_param->type() == framework::DataTypeTrait::DataType() - ? to_void_cast(residual_param->data()) - : to_void_cast(residual_param->data()); + ? platform::to_void_cast(residual_param->data()) + : platform::to_void_cast(residual_param->data()); auto residual_mem_p = this->AcquireMemory("@user_residual_data_mem_p"); if (residual_mem_p) { residual_mem_p->set_data_handle(residual_data); @@ -572,12 +653,14 @@ class ConvMKLDNNHandlerT } }; +} // anonymous namespace + template -class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; @@ -607,9 +690,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } template - void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { + void ComputeFP32(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const bool is_test = ctx.Attr("is_test"); @@ -656,407 +739,112 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { conv_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } template - void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { - const bool is_test = ctx.Attr("is_test"); - + void ComputeINT8(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto* input = ctx.Input("Input"); - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor.")); - - PADDLE_ENFORCE_GE(input->dims().size(), 4, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); - PADDLE_ENFORCE_LE(input->dims().size(), 5, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. 
NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); + const std::string& fuse_activation = + ctx.Attr("fuse_activation"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool is_conv3d = ctx.Attr>("strides").size() == 3U; - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); bool unsigned_output = (fuse_activation == "relu" || fuse_activation == "relu6"); - - const T* input_data = input->data(); - - auto src_tz = paddle::framework::vectorize(input->dims()); - - mkldnn::memory::data_type src_dt = - paddle::framework::ToMKLDNNDataType(input->type()); - - std::string key = - platform::CreateKey(dev_ctx, src_tz, src_dt, - ctx.InputName("Input") + ctx.InputName("Filter")); - bool need_s8_to_u8 = false; - std::shared_ptr conv_p; - std::shared_ptr src_memory_p; - std::shared_ptr user_src_memory_p; - std::shared_ptr dst_memory_p; - std::vector pipeline; - std::shared_ptr conv_pd; - std::shared_ptr handler; - - // This is workaround for hacky implementation - // of conv int8 mkl-dnn. Once conv fp32 and conv int8 - // are merged/unified, this will disappear - auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_conv_pd = key_tid + "@conv_pd"; - auto prim_key = key_tid + "@conv_p"; - auto dst_key = key_tid + "@dst_mem_p"; - auto src_key = key_tid + "@src_mem_p"; - auto weights_key = key_tid + "@weights_mem_p"; - auto bias_key = key_tid + "@bias_mem_p"; - auto user_src_key = key_tid + "@user_src_mem_p"; - auto user_residual_key = key_tid + "@user_residual_data_mem_p"; - auto src_reorder_key = key_tid + "@src_mem_preorder_p"; - auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p"; - - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + PADDLE_ENFORCE_NE( + is_conv3d, true, + platform::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, false, + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); - if (conv_pd == nullptr || !is_test) { - float fuse_alpha = ctx.Attr("fuse_alpha"); - float fuse_beta = ctx.Attr("fuse_beta"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); - auto* filter = ctx.Input("Filter"); + ConvMKLDNNHandlerT handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias, + output, ctx.InputName("Input") + ctx.InputName("Filter")); - PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Filter tensor.")); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - PADDLE_ENFORCE_GE(filter->dims().size(), 4, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. 
OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); - PADDLE_ENFORCE_LE(filter->dims().size(), 5, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const bool is_multi_channel = scale_weights_data.size() > 1; + const int& groups = ctx.Attr("groups"); + const bool& is_test = ctx.Attr("is_test"); + int mask_reorder = + is_multi_channel ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, is_test, scale_weights_data, mask_reorder); + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( - !fuse_residual_conn || !force_fp32_output, true, - platform::errors::Unimplemented( - "residual fusion does not support force output with fp32")); - - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); - - PADDLE_ENFORCE_EQ(bias->dims().size(), 1, - platform::errors::InvalidArgument( - "Bias must only have 1 dimension, i.e. X, but " - "got dimension = %d .", - bias->dims().size())); - } - - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - std::vector dilations(begin(dilations_temp), - end(dilations_temp)); - - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - bool is_conv3d = strides.size() == 3U; - - PADDLE_ENFORCE_NE(is_conv3d, true, - platform::errors::Unimplemented( - "int8 does not support conv3d currently")); - - auto input_dims = input->dims(); - auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); - auto filter_dims = filter->dims(); - auto filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); - - auto ksize = framework::vectorize(filter_data_dims); - - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - data_dims, strides, ksize); - - int groups = ctx.Attr("groups"); - auto weights_tz = paddle::framework::vectorize(filter->dims()); - int g = std::max(groups, 1); - - platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(output->dims()); - - std::transform(dilations.begin(), dilations.end(), dilations.begin(), - [](int64_t i) { return i - 1; }); - - const K* filter_data = filter->data(); - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_out_data = - force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); - float sum_scale = - fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; - - bool is_multi_channel = scale_weights_data.size() > 1; - - int count = is_multi_channel ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] - : (weights_tz)[0]) - : 1; - std::vector output_shift_scale(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - if (scale_weights_data[i] == 0.0) - output_shift_scale[i] = - scale_out_data; // weights data will contain 0 - // in some models, then weights - // scale couldn't be calculated - else - output_shift_scale[i] = - static_cast(static_cast(scale_out_data) / - (static_cast(scale_in_data) * - static_cast(scale_weights_data[i]))); - } - - auto user_src_md = - platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - auto chosen_memory_format = MKLDNNMemoryFormat::any; - - std::vector bias_tz; - - auto src_md = - platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, chosen_memory_format); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - - handler.reset( - new platform::ConvMKLDNNHandler(dev_ctx, mkldnn_engine, key)); - // create a conv primitive descriptor and save it for usage in backward - auto propagation = is_test ? mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training; - - if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, - MKLDNNMemoryFormat::x); - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, dilations, paddings, - mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } else { - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, paddle::none, dst_md, strides, dilations, - paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } - - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - int mask_reorder = - is_multi_channel ? ((g != 1) ? 
(1 << 1) + (1 << 0) : 1 << 0) : 0; - weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, true, scale_weights_data, - mask_reorder); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - PADDLE_ENFORCE_EQ( - output->dims(), residual_param->dims(), - platform::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - output->dims().size(), residual_param->dims().size())); - auto residual_dt = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - if (residual_param->format() != handler->GetDstFormat()) { - auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_dt, residual_param->format()); - dst_memory_p = platform::SetDstMemory( - ctx, output, residual_param, user_residual_md, handler, - &pipeline); - } else { - output->ShareDataWith(*residual_param); - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } else { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - - // create convolution op primitive - conv_p = handler->AcquireConvolution(); - if (bias) { - const K* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - auto user_bias_memory_p = handler->AcquireBiasMemory( - user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - int mask_reorder = is_multi_channel ? 1 << 0 : 1; - int count = - is_multi_channel - ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) - : 1; - std::vector scale_bias_data(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - scale_bias_data[i] = scale_in_data * scale_weights_data[i]; - } - bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( - user_bias_memory_p, pipeline, is_test, true, scale_bias_data, - mask_reorder); - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } - } else { - auto src_memory_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(src_reorder_key)); - src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(src_key)); - if (src_memory_reorder_p) { - user_src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_src_key)); - user_src_memory_p->set_data_handle(to_void_cast(input_data)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - src_memory_reorder_p->execute(astream, *user_src_memory_p, - *src_memory_p); - astream.wait(); - } - } else if (src_memory_p) { - src_memory_p->set_data_handle(to_void_cast(input_data)); - } - auto weights_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(weights_key)); + output->dims(), residual_param->dims(), + platform::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), residual_param->dims().size())); dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - output->ShareDataWith(*residual_param); - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } - platform::SetDstMemoryHandler(ctx, output, handler, dst_memory_p); + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (platform::MKLDNNGetDataType() == + mkldnn::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } - auto residual_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(residual_reorder_key)); - if (residual_reorder_p) { - auto user_residual_data_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_residual_key)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - residual_reorder_p->execute(astream, *user_residual_data_p, - *dst_memory_p); - astream.wait(); - } - } + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}; - auto bias_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(bias_key)); + if (bias) { + float mask_reorder; + std::vector scale_bias_data; + std::tie(mask_reorder, scale_bias_data) = + handler.get_int8_bias_scales(ctx); - if (bias_memory_p) { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - 
conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, is_test, scale_bias_data, mask_reorder); + args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + conv_p->execute(astream, args); astream.wait(); + if (need_s8_to_u8) { output->mutable_data(ctx.GetPlace()); } - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } }; template -class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNGradOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL ConvGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); @@ -1105,18 +893,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); astream.wait(); - filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_layout(framework::DataLayout::kMKLDNN); // in OneDNN groups in convolution are treated as separate dimension // which is not the case in paddlepaddle - auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); + auto filter_fmt = platform::GetMKLDNNFormat(*diff_weights_memory_p); // For convolution with groups convert from blocked to NCHW // otherwise there will be problems in next operators working on this data if (g > 1) { - memory::data_type in_type = framework::ToMKLDNNDataType(filter->type()); + mkldnn::memory::data_type in_type = + framework::ToMKLDNNDataType(filter->type()); // for 3d conv with groups (six dimensional data reorder to goidhw) // for 2d conv with groups (five dimensional data reorder to goihw) - // auto weights_tz = paddle::framework::vectorize(filter->dims()); + // auto weights_tz = framework::vectorize(filter->dims()); auto weights_tz = diff_weights_memory_p->get_desc().dims(); mkldnn::memory::format_tag out_format = @@ -1168,8 +957,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + input_grad->set_layout(framework::DataLayout::kMKLDNN); + input_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } } }; diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 57a3c38559316..c332b9194164e 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -148,8 +148,8 @@ class MatMulV2MKLDNNKernel if (x_dims.size() == 1) { x_bd_dims[x_bd_dims.size() - 1] = x_dims[0]; } else if (x_dims.size() == 2) { - x_bd_dims[2] = x_dims[1]; - x_bd_dims[1] = x_dims[0]; + x_bd_dims[x_bd_dims.size() - 1] = x_dims[1]; + x_bd_dims[x_bd_dims.size() - 2] = x_dims[0]; } else { for (size_t i = 0; i < x_dims.size(); ++i) { x_bd_dims[i] = x_dims[i]; @@ -158,8 +158,8 @@ class MatMulV2MKLDNNKernel if 
(y_dims.size() == 1) { y_bd_dims[x_bd_dims.size() - 2] = y_dims[0]; } else if (y_dims.size() == 2) { - y_bd_dims[2] = y_dims[1]; - y_bd_dims[1] = y_dims[0]; + y_bd_dims[y_bd_dims.size() - 1] = y_dims[1]; + y_bd_dims[y_bd_dims.size() - 2] = y_dims[0]; } else { for (size_t i = 0; i < y_dims.size(); ++i) { y_bd_dims[i] = y_dims[i]; diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index e6a7f3e74fcc7..6c3f4ec06201a 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/operators/flatten_op.h" #include "paddle/fluid/operators/squeeze_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace { +enum class ReshapeKernelOpName { + reshape, + reshape2, + squeeze, + squeeze2, + flatten, + flatten2, +}; +} // anonymous namespace + namespace paddle { namespace operators { @@ -41,7 +53,7 @@ static std::vector extract_shape( return vec_new_shape; } -template +template class ReshapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -55,43 +67,13 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { const auto& onednn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); - auto* xshape = ctx.Output("XShape"); auto* out = ctx.Output("Out"); - framework::DDim x_dims; - // if reshape or squeeze - if (ctx.Type().find("2") == std::string::npos) { - x_dims = x->dims(); - } else { - auto xshape_dims = xshape->dims(); - x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - } + framework::DDim x_dims, out_dims; + InferInOutShape(ctx, x_dims, out_dims); auto x_vec_dims = framework::vectorize(x_dims); - framework::DDim out_dims; - if (ctx.Type() == "squeeze") { - auto& axes = ctx.Attr>("axes"); - out_dims = GetOutputShape(axes, x_dims, true); - } else { - out_dims = out->dims(); - } - - if (ctx.Type().find("reshape") != std::string::npos) { - auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); - if (list_new_shape_tensor.size() > 0) { - auto new_shape = extract_shape(list_new_shape_tensor); - out_dims = ValidateShape(new_shape, x_dims); - } else if (ctx.HasInput("Shape")) { - auto* shape_tensor = ctx.Input("Shape"); - auto* shape_data = shape_tensor->data(); - - auto shape = - std::vector(shape_data, shape_data + shape_tensor->numel()); - out_dims = ValidateShape(shape, x_dims); - } - } - mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); platform::ReorderMKLDNNHandler reorder_handler(x_vec_dims, x->type(), x_type, onednn_engine); @@ -116,6 +98,104 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { framework::vectorize(out_dims)))); } + void InferInOutShape(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + switch (op_name) { + case ReshapeKernelOpName::reshape: + InferShapeReshapeOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::reshape2: + InferShapeReshape2Op(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::squeeze: + InferShapeSqueezeOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::squeeze2: + InferShapeSqueeze2Op(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::flatten: + InferShapeFlattenOp(ctx, x_dims, out_dims); + 
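// For the flatten/flatten2 cases dispatched here, the patch reuses
// FlattenKernel::GetOutputShape(axes, x_dims). A standalone sketch of what
// that shape rule does (illustrative code, not the operator implementation):
// the "axis" attribute folds an N-D shape into 2-D,
// [prod(dims[0:axis]), prod(dims[axis:N])].
#include <cstdint>
#include <vector>

std::vector<int64_t> FlattenShapeSketch(const std::vector<int64_t>& in_dims,
                                        int axis) {
  int64_t outer = 1, inner = 1;
  for (int i = 0; i < static_cast<int>(in_dims.size()); ++i) {
    (i < axis ? outer : inner) *= in_dims[i];
  }
  return {outer, inner};  // e.g. {3, 4, 5} with axis = 2 gives {12, 5}
}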
break; + case ReshapeKernelOpName::flatten2: + InferShapeFlattenOp(ctx, x_dims, out_dims); + break; + default: + PADDLE_THROW(paddle::platform::errors::OutOfRange( + "Reshape kernel doesn not support that operator name")); + } + } + + void InferShapeReshapeOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + x_dims = x->dims(); + out_dims = out->dims(); + ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); + } + + void InferShapeReshape2Op(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* out = ctx.Output("Out"); + auto* xshape = ctx.Output("XShape"); + auto xshape_dims = xshape->dims(); + x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + out_dims = out->dims(); + ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); + } + + // in reshape1/2 ops "ShapeTensor" has highest priority and "Shape" has + // second highest priority + void ChangeReshapeOutDimsIfNeeded(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); + if (list_new_shape_tensor.size() > 0) { + auto new_shape = extract_shape(list_new_shape_tensor); + out_dims = ValidateShape(new_shape, x_dims); + } else if (ctx.HasInput("Shape")) { + auto* shape_tensor = ctx.Input("Shape"); + auto* shape_data = shape_tensor->data(); + + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ValidateShape(shape, x_dims); + } + } + + void InferShapeSqueezeOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* x = ctx.Input("X"); + x_dims = x->dims(); + const auto& axes = ctx.Attr>("axes"); + out_dims = GetOutputShape(axes, x_dims, true); + } + + void InferShapeSqueeze2Op(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* out = ctx.Output("Out"); + auto* xshape = ctx.Output("XShape"); + auto xshape_dims = xshape->dims(); + x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + out_dims = out->dims(); + } + + void InferShapeFlattenOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto x = ctx.Input("X"); + x_dims = x->dims(); + auto axes = ctx.Attr("axis"); + out_dims = framework::make_ddim( + FlattenKernel::GetOutputShape( + axes, x_dims)); + } + protected: static mkldnn::memory::format_tag getPlainFormatTag(const Tensor* tensor) { auto tensor_dims_size = tensor->dims().size(); @@ -223,8 +303,8 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { } }; -template -class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { +template +class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { RunKernel(ctx); @@ -239,14 +319,9 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - framework::DDim x_dims; - // if reshape or squeeze - if (ctx.Type().find("2") == std::string::npos) { - x_dims = dx->dims(); - } else { - auto xshape_dims = ctx.Input("XShape")->dims(); - x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - } + framework::DDim dx_dims; + InferOutputShapeInGrad(ctx, dx_dims); + auto 
dout_vec_dims = framework::vectorize(dout->dims()); mkldnn::memory::data_type dout_type = @@ -265,44 +340,128 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - dx->Resize(x_dims); + dx->Resize(dx_dims); dx->set_layout(framework::DataLayout::kMKLDNN); dx->set_format(GetMKLDNNFormat(reorder_dst_memory_p->get_desc().reshape( - framework::vectorize(x_dims)))); + framework::vectorize(dx_dims)))); } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(squeeze, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - -REGISTER_OP_KERNEL(squeeze_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); -REGISTER_OP_KERNEL(squeeze2, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - -REGISTER_OP_KERNEL(squeeze2_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); + void InferOutputShapeInGrad(const framework::ExecutionContext& ctx, + framework::DDim& x_dims) const { + switch (op_name) { + case ReshapeKernelOpName::reshape: + InferShapeReshapeSqueezeGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::reshape2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::squeeze: + InferShapeReshapeSqueezeGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::squeeze2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::flatten: + InferShapeFlattenGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::flatten2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + default: + PADDLE_THROW(paddle::platform::errors::OutOfRange( + "Reshape grad kernel doesn not support that operator name")); + } + } -REGISTER_OP_KERNEL(reshape, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); + void InferShapeReshapeSqueezeGradOp(const framework::ExecutionContext& ctx, + framework::DDim& dx_dims) const { + auto* dx = ctx.Output(framework::GradVarName("X")); + dx_dims = dx->dims(); + } -REGISTER_OP_KERNEL(reshape_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); + void InferShapeReshape2Squeeze2Flatten2GradOp( + const framework::ExecutionContext& ctx, framework::DDim& dx_dims) const { + auto xshape_dims = ctx.Input("XShape")->dims(); + dx_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + } -REGISTER_OP_KERNEL(reshape2, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); + void InferShapeFlattenGradOp(const framework::ExecutionContext& ctx, + framework::DDim& dx_dims) const { + dx_dims = ctx.Input("X")->dims(); + } +}; +} // namespace operators +} // namespace paddle -REGISTER_OP_KERNEL(reshape2_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); +namespace ops = paddle::operators; +REGISTER_OP_KERNEL( + squeeze, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + 
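// The registrations in this block now carry the operator identity as a
// non-type template argument (ReshapeKernelOpName::...), so one kernel body
// can branch per op without inspecting ctx.Type() strings. A minimal,
// self-contained sketch of that dispatch pattern (illustrative names only,
// not Paddle APIs):
#include <cstdio>

enum class OpName { reshape, squeeze, flatten };

template <OpName op_name>
struct ShapeInferSketch {
  void Run() const {
    // op_name is a compile-time constant, so each instantiation effectively
    // keeps only the branch it needs.
    switch (op_name) {
      case OpName::reshape: std::puts("reshape path"); break;
      case OpName::squeeze: std::puts("squeeze path"); break;
      case OpName::flatten: std::puts("flatten path"); break;
    }
  }
};

int main() {
  ShapeInferSketch<OpName::reshape>{}.Run();  // one "registration" per op
  ShapeInferSketch<OpName::flatten>{}.Run();
}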
+REGISTER_OP_KERNEL( + squeeze2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index bb6549c111988..d10e94962d6a6 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -26,6 +26,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" +DECLARE_string(npu_precision_mode); + namespace paddle { namespace operators { @@ -404,6 +406,12 @@ void NpuOpRunner::Run(aclrtStream stream) const { VLOG(4) << "attr: " << attr_; VLOG(4) << "stream: " << stream; + if (!FLAGS_npu_precision_mode.empty()) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclSetCompileopt(ACL_PRECISION_MODE, FLAGS_npu_precision_mode.c_str())); + VLOG(4) << "set ACL_PRECISION_MODE: " << FLAGS_npu_precision_mode; + } + aclError ret = aclopCompileAndExecute( op_type_.c_str(), input_descs_.size(), input_descs_.data(), input_buffers_.data(), output_descs_.size(), output_descs_.data(), diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 42477232e7ca1..3e7023bd1260f 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -14,7 +14,29 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" +#include "paddle/fluid/platform/fast_divmod.h" + +#if defined(__NVCC__) && CUDA_VERSION >= 11000 +/* Once CUDA_VERSION is beyond 11.0, cooperative_groups can be involved in + without adding --rdc=true compile flag, then L2_norm cuda kernel can be + set as a __device__ kernel rather than global kernel. On the contrary, + the compile flag shall be set in old version, which may affect the cuda + kernel performance in paddle, consequently, L2_norm kernel shall be set + as a __global__ kernel. 
+*/ +#include +#define LARS_FUNCTION_FLAG __device__ +#else +#define LARS_FUNCTION_FLAG __global__ +#endif + +#ifdef __HIPCC__ +#define LARS_BLOCK_SIZE 256 +#else +#define LARS_BLOCK_SIZE 512 +#endif namespace paddle { namespace operators { @@ -22,55 +44,207 @@ namespace operators { template using MultiPrecisionType = typename details::MPTypeTrait::Type; +__device__ __forceinline__ float Sqrt(float x) { return sqrtf(x); } +__device__ __forceinline__ double Sqrt(double x) { return sqrt(x); } +__device__ __forceinline__ float Fma(float x, float y, float z) { + return fmaf(x, y, z); +} +__device__ __forceinline__ double Fma(double x, double y, double z) { + return fma(x, y, z); +} + +template +__device__ inline void VectorizeLarsUpdate( + const T* __restrict__ grad, const MT* __restrict__ param, + const MT* __restrict__ velocity, T* __restrict__ param_out, + MT* __restrict__ velocity_out, const MT mu, MT local_lr, + const MT lars_weight_decay, const MT rescale_grad, const int tid, + const int grid_stride, const int numel, + MT* __restrict__ master_param_out = nullptr) { + using VecType = paddle::platform::AlignedVector; + using VecMType = paddle::platform::AlignedVector; + int main = numel >> (VecSize >> 1); + int tail_offset = main * VecSize; + + const VecType* __restrict__ grad_vec = reinterpret_cast(grad); + const VecMType* __restrict__ param_vec = + reinterpret_cast(param); + const VecMType* __restrict__ velocity_vec = + reinterpret_cast(velocity); + VecType* param_out_vec = reinterpret_cast(param_out); + VecMType* velocity_out_vec = reinterpret_cast(velocity_out); + + VecMType* master_param_out_vec; + if (IsAmp) { + master_param_out_vec = reinterpret_cast(master_param_out); + } + + for (int i = tid; i < main; i += grid_stride) { + VecType param_out_tmp; + VecMType velocity_tmp, param_tmp; + VecType grad_data = grad_vec[i]; + VecMType param_data = param_vec[i]; + VecMType velocity_data = velocity_vec[i]; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT grad_val = static_cast(grad_data[j]) * rescale_grad; + velocity_tmp[j] = + Fma(velocity_data[j], mu, + local_lr * Fma(lars_weight_decay, param_data[j], grad_val)); + param_tmp[j] = param_data[j] - velocity_tmp[j]; + param_out_tmp[j] = static_cast(param_tmp[j]); + } + param_out_vec[i] = param_out_tmp; + velocity_out_vec[i] = velocity_tmp; + if (IsAmp) { + master_param_out_vec[i] = param_tmp; + } + } + + for (int i = tid + tail_offset; i < numel; i += grid_stride) { + MT grad_val = static_cast(grad[i]) * rescale_grad; + MT param_val = param[i]; + MT velocity_tmp = Fma(velocity[i], mu, local_lr * Fma(lars_weight_decay, + param_val, grad_val)); + MT param_tmp = param_val - velocity_tmp; + param_out[i] = static_cast(param_tmp); + velocity_out[i] = velocity_tmp; + if (IsAmp) { + master_param_out[i] = param_tmp; + } + } +} + template -__global__ void MomentumLarsKernel( - const T* p, const T* g, const MT* v, - const MultiPrecisionType* learning_rate, const MT mu, const int64_t num, - const MT lars_coeff, const MT lars_weight_decay, - const MultiPrecisionType* p_norm, const MultiPrecisionType* g_norm, - T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out, - const MultiPrecisionType rescale_grad) { - const MT lr = static_cast(learning_rate[0]); - MT local_lr = lr; - const MT p_n = static_cast(p_norm[0]); - const MT g_n = static_cast(g_norm[0]); +LARS_FUNCTION_FLAG void L2NormKernel( + const T* __restrict__ p_data, const T* __restrict__ g_data, + MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, + const int 
repeat_times, const int64_t numel, const MT rescale_grad, + MT* __restrict__ p_n = nullptr, MT* __restrict__ g_n = nullptr) { + int tid = threadIdx.x + blockDim.x * blockIdx.x; + int grid_stride = LARS_BLOCK_SIZE * gridDim.x; + const MT rescale_grad_pow = rescale_grad * rescale_grad; + __shared__ MT s_buffer[2]; + s_buffer[0] = static_cast(0); + s_buffer[1] = static_cast(0); + MT p_tmp_val = static_cast(0); + MT g_tmp_val = static_cast(0); - if (lars_weight_decay > static_cast(0) && p_n > static_cast(0) && - g_n > static_cast(0)) { - local_lr = - lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon); + if (repeat_times == 0) { + if (tid < numel) { + p_tmp_val = static_cast(p_data[tid]); + g_tmp_val = static_cast(g_data[tid]); + } + s_buffer[0] += math::blockReduceSum(p_tmp_val * p_tmp_val, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_tmp_val * g_tmp_val, FINAL_MASK); + } else { + /* To avoid occupy too much temp buffer. Hence, slice the whole data into 2 + parts, the front of them whose quantity is excatly multiple of grid-thread + number, and this part of data is delt in for loop, the rest of data is delt + with another step to avoid visiting data address beyond bound. */ + for (int i = 0; i < repeat_times; ++i) { + p_tmp_val = static_cast(p_data[tid]); + g_tmp_val = static_cast(g_data[tid]); + tid += grid_stride; + s_buffer[0] += + math::blockReduceSum(p_tmp_val * p_tmp_val, FINAL_MASK); + s_buffer[1] += + math::blockReduceSum(g_tmp_val * g_tmp_val, FINAL_MASK); + __syncthreads(); + } + MT p_val = 0; + MT g_val = 0; + if (tid < numel) { + p_val = static_cast(p_data[tid]); + g_val = static_cast(g_data[tid]); + } + s_buffer[0] += math::blockReduceSum(p_val * p_val, FINAL_MASK); + s_buffer[1] += math::blockReduceSum(g_val * g_val, FINAL_MASK); } - CUDA_KERNEL_LOOP(i, num) { - MT grad = static_cast(g[i]) * static_cast(rescale_grad); - MT param = master_p ? master_p[i] : static_cast(p[i]); + __syncthreads(); + + if (threadIdx.x == 0) { + p_buffer[blockIdx.x] = s_buffer[0]; + g_buffer[blockIdx.x] = s_buffer[1]; + } + +#if CUDA_VERSION >= 11000 + // Grid sync for completely writring partial result back to gloabl memory + const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); + cg.sync(); + MT p_partial_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; + MT g_partial_sum = threadIdx.x < gridDim.x ? 
g_buffer[threadIdx.x] : 0; + *p_n = Sqrt(math::blockReduceSum(p_partial_sum, FINAL_MASK)); + *g_n = Sqrt(rescale_grad_pow * + math::blockReduceSum(g_partial_sum, FINAL_MASK)); +#endif +} - MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param); - MT p_new = param - v_new; +template +__global__ void MomentumLarsKernel( + const T* __restrict__ param, const T* __restrict__ grad, + const MT* __restrict__ velocity, T* param_out, MT* velocity_out, + const MT* __restrict__ master_param, MT* __restrict__ master_param_out, + const MT* __restrict__ learning_rate, MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, const MT mu, const MT lars_coeff, + const MT lars_weight_decay, const MT epsilon, const MT rescale_grad, + const int repeat_times, const int thresh, const int64_t numel) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int grid_stride = gridDim.x * LARS_BLOCK_SIZE; +#if CUDA_VERSION >= 11000 + MT param_norm = static_cast(0); + MT grad_norm = static_cast(0); + L2NormKernel(param, grad, p_buffer, g_buffer, repeat_times, numel, + rescale_grad, ¶m_norm, &grad_norm); +#else + const MT rescale_grad_pow = rescale_grad * rescale_grad; + MT param_parital_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0; + MT grad_parital_norm = threadIdx.x < thresh ? g_buffer[threadIdx.x] : 0; + __syncthreads(); + MT param_norm = + Sqrt(math::blockReduceSum(param_parital_norm, FINAL_MASK)); + MT grad_norm = Sqrt(rescale_grad_pow * + math::blockReduceSum(grad_parital_norm, FINAL_MASK)); +#endif - v_out[i] = v_new; - p_out[i] = static_cast(p_new); - if (master_p_out) master_p_out[i] = p_new; + const MT lr = learning_rate[0]; + MT local_lr = lr; + if (lars_weight_decay > static_cast(0)) { + local_lr = lr * lars_coeff * param_norm / + (Fma(lars_weight_decay, param_norm, grad_norm) + epsilon); + } + + if (master_param_out) { + VectorizeLarsUpdate(grad, master_param, velocity, param_out, + velocity_out, mu, local_lr, + lars_weight_decay, rescale_grad, tid, + grid_stride, numel, master_param_out); + } else { + if (std::is_same::value || + std::is_same::value) { + // As for multiple-precision, type T and MT cannot be more than fp16 or + // fp32, Then, the maximum data IO size could be set to 4. 
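// VectorizeLarsUpdate, called just below, relies on vector-width memory
// accesses: VecSize is 4 for fp16/fp32 and 2 for double so every load/store
// is a full 128-bit transaction, with a scalar tail loop for the leftover
// elements. A standalone CUDA sketch of that main/tail split, reduced to a
// plain scaled copy rather than the LARS formula (illustrative names; the
// pointers are assumed aligned to sizeof(T) * VecSize, which cudaMalloc
// guarantees):
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) VecSketch {
  T val[VecSize];
};

template <typename T, int VecSize>
__global__ void ScaledCopySketch(const T* __restrict__ in, T* __restrict__ out,
                                 T alpha, int numel) {
  const int tid = blockIdx.x * blockDim.x + threadIdx.x;
  const int stride = gridDim.x * blockDim.x;
  const int main_count = numel / VecSize;        // vectorized portion
  const int tail_offset = main_count * VecSize;  // scalar remainder

  const auto* in_vec = reinterpret_cast<const VecSketch<T, VecSize>*>(in);
  auto* out_vec = reinterpret_cast<VecSketch<T, VecSize>*>(out);
  for (int i = tid; i < main_count; i += stride) {
    VecSketch<T, VecSize> v = in_vec[i];
#pragma unroll
    for (int j = 0; j < VecSize; ++j) v.val[j] *= alpha;
    out_vec[i] = v;
  }
  for (int i = tail_offset + tid; i < numel; i += stride) {
    out[i] = in[i] * alpha;
  }
}
// Example launch: ScaledCopySketch<float, 4><<<grid, 256>>>(in, out, 2.f, n);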
+ VectorizeLarsUpdate( + grad, reinterpret_cast(param), velocity, param_out, + velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, + grid_stride, numel); + } else { + VectorizeLarsUpdate( + grad, reinterpret_cast(param), velocity, param_out, + velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, + grid_stride, numel); + } } } template class LarsMomentumOpCUDAKernel : public framework::OpKernel { - using MPDType = MultiPrecisionType; + using MT = MultiPrecisionType; public: void Compute(const framework::ExecutionContext& ctx) const override { const bool multi_precision = ctx.Attr("multi_precision"); - if (multi_precision) { - InnerCompute(ctx, multi_precision); - } else { - InnerCompute(ctx, multi_precision); - } - } - - private: - template - void InnerCompute(const framework::ExecutionContext& ctx, - const bool multi_precision) const { auto param_out = ctx.Output("ParamOut"); auto velocity_out = ctx.Output("VelocityOut"); auto param = ctx.Input("Param"); @@ -78,8 +252,13 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { auto grad = ctx.Input("Grad"); auto learning_rate = ctx.Input("LearningRate"); + int64_t numel = param->numel(); + int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE; const framework::Tensor* master_param = nullptr; framework::Tensor* master_param_out = nullptr; + const MT* master_param_data = nullptr; + MT* master_param_out_data = nullptr; + if (multi_precision) { bool has_master = ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); @@ -90,56 +269,114 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel { "the attr `multi_precision` is true")); master_param = ctx.Input("MasterParam"); master_param_out = ctx.Output("MasterParamOut"); + master_param_data = master_param->data(); + master_param_out_data = + master_param_out->mutable_data(ctx.GetPlace()); } - - const MT* master_p = multi_precision ? master_param->data() : nullptr; - MT* master_p_out = multi_precision - ? master_param_out->mutable_data(ctx.GetPlace()) - : nullptr; - - T* p_out = param_out->mutable_data(ctx.GetPlace()); - MT* v_out = velocity_out->mutable_data(ctx.GetPlace()); - MT mu = static_cast(ctx.Attr("mu")); MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); MT lars_weight_decay = static_cast(ctx.Attr("lars_weight_decay")); MT epsilon = static_cast(ctx.Attr("epsilon")); - MPDType rescale_grad = - static_cast(ctx.Attr("rescale_grad")); - - auto* p = param->data(); - auto* g = grad->data(); - auto* v = velocity->data(); - auto* lr = learning_rate->data(); - - int block = 512; - int grid = (param->numel() + block - 1) / block; - - auto eigen_p = framework::EigenVector::Flatten(*param); - auto eigen_g = framework::EigenVector::Flatten(*grad); - // calculate norms using eigein and launch the kernel. 
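// The Eigen-based norm computation removed below is what L2NormKernel above
// replaces: every block accumulates a partial sum of squares (the patch uses
// math::blockReduceSum) and writes it to p_buffer/g_buffer, then a second
// reduction finishes the norm. A standalone CUDA sketch of the per-block
// stage, using a plain shared-memory tree reduction instead of Paddle's
// helper (illustrative names; BlockSize must be a power of two and equal to
// blockDim.x):
template <typename T, int BlockSize>
__global__ void PartialSquaredSumSketch(const T* __restrict__ x, T* partials,
                                        int numel) {
  __shared__ T smem[BlockSize];
  T sum = 0;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < numel;
       i += gridDim.x * blockDim.x) {
    sum += x[i] * x[i];
  }
  smem[threadIdx.x] = sum;
  __syncthreads();
  // Tree reduction within the block.
  for (int offset = BlockSize / 2; offset > 0; offset >>= 1) {
    if (threadIdx.x < offset) smem[threadIdx.x] += smem[threadIdx.x + offset];
    __syncthreads();
  }
  // One partial per block; reducing `partials` and taking sqrt (plus the
  // rescale_grad factor for the gradient norm) yields the L2 norm.
  if (threadIdx.x == 0) partials[blockIdx.x] = smem[0];
}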
- framework::Tensor p_norm_t, g_norm_t; - p_norm_t.Resize({1}); - g_norm_t.Resize({1}); - auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); - auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); - - auto* place = ctx.template device_context().eigen_device(); - - // eigen unsupport fp16 l2-norm - ep_norm.device(*place) = - eigen_p.template cast().square().sum().sqrt(); - eg_norm.device(*place) = - (eigen_g.template cast() * rescale_grad).square().sum().sqrt(); + MT rescale_grad = static_cast(ctx.Attr("rescale_grad")); - MomentumLarsKernel< - T, MT><<>>( - p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out, + auto* param_data = param->data(); + auto* grad_data = grad->data(); + auto* velocity_data = velocity->data(); + auto* lr = learning_rate->data(); + auto& cuda_ctx = ctx.template device_context(); + T* param_out_data = param_out->mutable_data(ctx.GetPlace()); + MT* velocity_out_data = velocity_out->mutable_data(ctx.GetPlace()); + +#if CUDA_VERSION >= 11000 + /* + Once model trainning with lars optimizer, whose principal implementation + is achieved by following two steps: + 1. Figure out the L2 norm statistic result of grad data and param data. + 2. Update param and velocity data with usage of L2 norm statistic result. + + Orignally, these two steps were fulfilled by respective eigen function and + cuda kernel, however the overhead of eigen function occupied much ratio in + total, consequently affect the performance of lars op, make it necessary + to combine 2 steps into one cuda kernel. + Since the step1 is l2 norm statistic, grid level reduce is needed. To + achieve this and continuous calculation of step 2 in only one global + lanuch, essential basis is to control all grid-threads while running. Apart + from normal lanuch form, cuda9.0 provides `cudaLaunchCooperativeKernel` + api : + - The thread quantity shall less than pyhsical SM limited threads + - Launches a device function where thread blocks can cooperate and + synchronize as they execute. + */ + // Figure out how many blocks can be active in each sm. 
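// The cooperative launch below is only legal when every block of the grid can
// be resident at once, hence the occupancy query that follows this comment.
// A stripped-down, standalone sketch of the same pattern, a grid-wide barrier
// between a produce phase and a consume phase (error handling omitted,
// illustrative names):
#include <cooperative_groups.h>

__global__ void TwoPhaseSketch(float* block_partials, float* result) {
  namespace cg = cooperative_groups;
  cg::grid_group grid = cg::this_grid();
  // Phase 1: each block publishes one partial value.
  if (threadIdx.x == 0) {
    block_partials[blockIdx.x] = static_cast<float>(blockIdx.x);
  }
  // Grid-wide barrier: all partials are written and visible afterwards.
  grid.sync();
  // Phase 2: a single thread consumes every block's partial.
  if (blockIdx.x == 0 && threadIdx.x == 0) {
    float sum = 0.f;
    for (unsigned i = 0; i < gridDim.x; ++i) sum += block_partials[i];
    *result = sum;
  }
}
// Host side, mirroring the code below: cap the grid at what fits on the GPU,
// then launch with cudaLaunchCooperativeKernel so grid.sync() is valid.
//   int blocks_per_sm = 0;
//   cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm,
//                                                 TwoPhaseSketch, 256, 0);
//   void* args[] = {&block_partials, &result};
//   cudaLaunchCooperativeKernel(reinterpret_cast<void*>(TwoPhaseSketch),
//                               dim3(blocks_per_sm * sm_count), dim3(256),
//                               args, 0, stream);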
+ int num_blocks_per_sm = 0; + cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, + MomentumLarsKernel, + LARS_BLOCK_SIZE, sizeof(MT)); + int sm_num = cuda_ctx.GetSMCount(); + int grid_real = + std::min(std::min(sm_num * num_blocks_per_sm, grid), LARS_BLOCK_SIZE); + framework::Tensor tmp_buffer_t = + ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); + auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); + auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; + int grid_stride = LARS_BLOCK_SIZE * grid; + int repeat_times = (numel + grid_stride - 1) / grid_stride - 1; + int thresh = 0; + + // Uniform kernel parameter for cudaLaunchCooperativeKernel + void* cuda_param[] = { + reinterpret_cast(¶m_data), + reinterpret_cast(&grad_data), + reinterpret_cast(&velocity_data), + reinterpret_cast(¶m_out_data), + reinterpret_cast(&velocity_out_data), + reinterpret_cast(&master_param_data), + reinterpret_cast(&master_param_out_data), + reinterpret_cast(&lr), + reinterpret_cast(&p_buffer), + reinterpret_cast(&g_buffer), + reinterpret_cast(&mu), + reinterpret_cast(&lars_coeff), + reinterpret_cast(&lars_weight_decay), + reinterpret_cast(&epsilon), + reinterpret_cast(&rescale_grad), + reinterpret_cast(&repeat_times), + reinterpret_cast(&thresh), // Just a placeholder + reinterpret_cast(&numel)}; + // Lanuch all sm theads. + cudaLaunchCooperativeKernel( + reinterpret_cast(MomentumLarsKernel), grid_real, + LARS_BLOCK_SIZE, cuda_param, 0, cuda_ctx.stream()); +#else + // Determine to read 4 fp16 or float data once, but 2 double data once. + int grid_lars = + sizeof(T) < sizeof(double) + ? (numel + (LARS_BLOCK_SIZE << 2) - 1) / (LARS_BLOCK_SIZE << 2) + : (numel + (LARS_BLOCK_SIZE << 1) - 1) / (LARS_BLOCK_SIZE << 1); + + int grid_norm = std::min(grid, LARS_BLOCK_SIZE); + framework::Tensor p_buffer_t = + ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); + auto* p_buffer = p_buffer_t.mutable_data(ctx.GetPlace()); + auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; + + const int grid_stride = LARS_BLOCK_SIZE * grid_norm; + const int repeat_times = (numel + grid_stride - 1) / grid_stride - 1; + + L2NormKernel<<>>( + param_data, grad_data, p_buffer, g_buffer, repeat_times, numel, rescale_grad); + + MomentumLarsKernel< + T, MT><<>>( + param_data, grad_data, velocity_data, param_out_data, velocity_out_data, + master_param_data, master_param_out_data, lr, p_buffer, g_buffer, mu, + lars_coeff, lars_weight_decay, epsilon, rescale_grad, 0, grid_norm, + numel); // 0 is just a placeholder. +#endif } }; diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index d3faa2c8460f2..da637dfeb237d 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -25,22 +25,26 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "Tensor, " + "(Tensor), " "the input of PSROIPoolOp. " "The format of input tensor is NCHW. Where N is the batch size, " "C is the number of input channels, " "H is the height of the input feature map, and " "W is the width. The data type can be float32 or float64"); AddInput("ROIs", - "LoDTensor, " + "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]. " "where (x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates. 
" "The roi batch index can be calculated from LoD."); + AddInput("RoisNum", + "(Tensor), " + "The number of RoIs in each image.") + .AsDispensable(); AddOutput("Out", - "Tensor, " + "(Tensor), " "the output of PSROIPoolOp is a 4-D Tensor with shape " "(num_rois, output_channels, pooled_h, pooled_w). " "The data type is the same as `x` "); @@ -65,8 +69,6 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "the pooled output width.") .SetDefault(1); AddComment(R"Doc( -**PSROIPool Operator,** `rois` **of this op should be a LoDTensor** - Position sensitive region of interest pooling (also known as PSROIPooling) is to perform position-sensitive average pooling on regions of interest specified by input, takes as input N position-sensitive score maps and a list of num_rois regions of interest. @@ -106,7 +108,14 @@ class PSROIPoolOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]")); - + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); + PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, + platform::errors::InvalidArgument( + "The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); + } int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); int output_channels = ctx->Attrs().Get("output_channels"); @@ -184,6 +193,7 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { op->SetType("psroi_pool_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); + op->SetInput("RoisNum", this->Input("RoisNum")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index 748b6036008f1..f69edfc1fcfec 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -185,34 +185,67 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; if (rois_num == 0) return; - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ(rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id + int rois_batch_size; framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_data, sizeof(int) * rois_batch_size, 0); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_list[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + platform::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, rois_num_with_lod)); + + // set rois batch id + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); @@ -257,14 +290,30 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 4f4cb24844b8c..4d7e9ce295fc8 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -40,6 +40,13 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int width = in_dims[3]; int rois_num = rois->dims()[0]; + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + platform::errors::InvalidArgument( + "the channels of input " + "X should equal the product of " + "output_channels x pooled_height x pooled_width")); + auto in_stride = framework::stride(in_dims); auto out_stride = framework::stride(out->dims()); @@ -49,32 +56,52 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int 
rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. But received the batch size of rois is %d, " + "and the batch size of images is %d", + rois_batch_size, batch_size)); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_data[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument("the rois_batch_size and input(X) " + "batch_size should be the same.")); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num_with_lod, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and lod must be the same")); + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - T* output_data = out->mutable_data(ctx.GetPlace()); const T* input_rois = rois->data(); @@ -93,7 +120,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; T roi_end_h = static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - // Force too small rois to be 1 x 1 T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); @@ -172,15 +198,28 @@ class CPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - const T* input_rois = rois->data(); const T* output_grad_data = output_grad->data(); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 4760270caa3c6..28b6ebc243322 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -34,6 +34,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" @@ -705,8 +706,16 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, if (config.reduce_num == 1) { auto out_dims = y->dims(); - framework::TensorCopy(x, y->place(), y); - y->Resize(out_dims); + if (x.type() == y->type()) { + framework::TensorCopy(x, y->place(), y); + y->Resize(out_dims); + } else { + auto* dev_ctx = static_cast( + paddle::platform::DeviceContextPool::Instance().Get(x.place())); + framework::VisitDataType( + static_cast(y->type()), + CastOpFunctor(&x, y, *dev_ctx)); + } return; } diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 51ff8f189b151..6f244b1a4cb8f 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -229,7 +229,7 @@ class ReshapeOp : public framework::OperatorWithKernel { // by now we require that if the input tensor is zero shape, the target // shape of output must be zero if (in_size == 0) { - PADDLE_ENFORCE_EQ( + PADDLE_ENFORCE_LE( capacity, in_size, platform::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. 
" @@ -248,13 +248,13 @@ class ReshapeOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -366,13 +366,13 @@ class ReshapeGradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -557,13 +557,13 @@ class Reshape2GradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index ac352876e7871..04e4dc62b039b 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -142,10 +142,15 @@ static void ShareVarsIntoScope(const std::vector &vars, static void ShareVarsFromScope(const std::vector &vars, const std::vector &var_names, + const BlockDesc &global_block, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't findthem in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. 
if (var_names[i] == framework::kEmptyVarName || - var_names[i] == "Fake_var") { + var_names[i] == "Fake_var" || !global_block.HasVar(var_names[i])) { VLOG(2) << "find variable name is " << var_names[i] << ", skip it!"; continue; } @@ -214,8 +219,10 @@ class RunProgramOpKernel : public framework::OpKernel { details::ShareVarsIntoScope(input_vars, input_var_names, &scope); details::ShareVarsIntoScope(param_vars, param_names, &scope); + auto *global_block = ctx.Attr("global_block"); + if (end_op_index > start_op_index) { - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad=*/false, program_id, &scope); @@ -240,8 +247,10 @@ class RunProgramOpKernel : public framework::OpKernel { parallel_executor->RunWithoutFetch(skip_eager_delete_vars); } // Step 4. Get Output - details::ShareVarsFromScope(output_vars, output_var_names, &scope); - details::ShareVarsFromScope(dout_vars, dout_var_names, &scope); + details::ShareVarsFromScope(output_vars, output_var_names, *global_block, + &scope); + details::ShareVarsFromScope(dout_vars, dout_var_names, *global_block, + &scope); // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); @@ -307,10 +316,11 @@ class RunProgramGradOpKernel : public framework::OpKernel { "least one sub scope.")); auto &scope = *(global_inner_scope->kids().front()); + auto *global_block = ctx.Attr("global_block"); if (end_op_index > start_op_index) { // Step 2. prepare executor and scope - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad*/ true, program_id, &scope); @@ -341,8 +351,10 @@ class RunProgramGradOpKernel : public framework::OpKernel { } // Step 4. get outputs - details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, &scope); - details::ShareVarsFromScope(param_grad_vars, param_grad_names, &scope); + details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, + *global_block, &scope); + details::ShareVarsFromScope(param_grad_vars, param_grad_names, + *global_block, &scope); // Step5. drop current scope global_inner_scope->DeleteScope(&scope); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index 2381719020869..744a9b137f622 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -12,11 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include - -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/scale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 2f3e4c9ba88c3..32daa8c3934ae 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -39,6 +39,12 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddOutput("Out", "The output of seed op."); AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddAttr("force_cpu", + "(bool, default false) Force fill output variable to cpu " + "memory. 
Otherwise, fill output variable to the running " + "device") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Seed Operator. )DOC"); @@ -55,3 +61,15 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( seed, ops::CPUSeedKernel); + +/* ========================== register checkpoint ===========================*/ +REGISTER_OP_VERSION(seed) + .AddCheckpoint( + R"ROC( + Upgrade seed add a new attribute [force_cpu])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "force_cpu", + "If true, Force fill output variable to cpu." + "memory. Otherwise, fill output variable to the running " + "device", + false)); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index c84407ba52dfd..4593b88019621 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/seed_op.h" namespace paddle { @@ -20,10 +21,10 @@ namespace operators { template class GPUSeedKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); + void Compute(const framework::ExecutionContext &context) const override { + auto *out = context.Output("Out"); int user_seed = context.Attr("seed"); + auto force_cpu = context.Attr("force_cpu"); std::random_device rnd; int seed; if (user_seed != 0) { @@ -31,11 +32,24 @@ class GPUSeedKernel : public framework::OpKernel { } else { seed = rnd(); } - auto target_gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, - sizeof(int), stream); + + bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(context.GetPlace()); + out->mutable_data(platform::CPUPlace()); + math::SetConstant functor; + functor(reinterpret_cast(dev_ctx), + out, static_cast(seed)); + } else { + auto *out_data = out->mutable_data(context.GetPlace()); + auto target_gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, + sizeof(int), stream); + } } }; diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index f8b513fca4824..671f397d4eaff 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -14,6 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 3a8d81920f262..e7b124d5bddd6 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -10,291 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/set_value_op.h" -#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/npu_op_runner.h" -#include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { -template -class SetValueNPUKernel : public framework::OpKernel { - private: - using Vector_Int64 = std::vector; - void GetNPUStartEndSteps(const Vector_Int64& start, const Vector_Int64& end, - const Vector_Int64& steps, const Vector_Int64& axes, - const framework::DDim& in_dim, - std::vector>& output) const { - int rank = in_dim.size(); - for (int i = 0; i < rank; ++i) { - int axis_size = in_dim[i]; - auto iter = find(axes.begin(), axes.end(), i); - if (iter != axes.end()) { - int idx = iter - axes.begin(); - output[0].push_back(start[idx]); // set as the same as raw input - output[1].push_back(end[idx]); - output[2].push_back(steps[idx]); - } else { - output[0].push_back(0); // begin 0 - output[1].push_back(axis_size); // end = last one - output[2].push_back(1); // step = 1 - } - } - } - - inline std::vector MininumPadNumberMakeSureLastDimGT8( - const std::vector>& npu_slice) const { - int rank = npu_slice[0].size(); - int last_dim_start = npu_slice[0][rank - 1]; - int last_dim_end = npu_slice[1][rank - 1]; - int last_dim_step = npu_slice[2][rank - 1]; - int min_end = last_dim_start + last_dim_step * min_last_dim_value_; - int raw_last_dim_len = (last_dim_end - last_dim_start) / last_dim_step; - return std::vector({std::max(0, min_end - last_dim_end), - min_last_dim_value_ - raw_last_dim_len}); - } - - inline void TileTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - VLOG(4) << "start to tile tensor function, which calls the npu operator " - "TileWithAxis"; - // UNSQUEEZE last dim + TILE last dim * min_last_dim_value_ - Tensor reshape_tensor; - auto reshape_dims = framework::vectorize(input->dims()); - reshape_dims.push_back(1); - reshape_tensor.ShareDataWith(*input); - reshape_tensor.Resize(framework::make_ddim(reshape_dims)); - - auto output_dims = framework::vectorize(input->dims()); - output_dims.push_back(min_last_dim_value_); - output->mutable_data(framework::make_ddim(output_dims), ctx->GetPlace()); - - framework::NPUAttributeMap attr; - attr["axis"] = static_cast(reshape_dims.size() - 1); - attr["tiles"] = min_last_dim_value_; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("TileWithAxis", {reshape_tensor}, {*output}, attr).Run(stream); - } - - inline void BroadcastToD(const framework::ExecutionContext* ctx, - const Tensor* input, - const std::vector* shape, - Tensor* output) const { - VLOG(4) << "Start BroadCast To"; - auto new_shape = std::vector(shape->begin(), shape->end()); - output->mutable_data(framework::make_ddim(new_shape), ctx->GetPlace()); - framework::NPUAttributeMap attr; - attr["shape"] = new_shape; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("BroadcastToD", {*input}, {*output}, attr).Run(stream); - } - - inline void 
CropTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - auto out_dims = output->dims(); - auto in_dims = input->dims(); - int rank = in_dims.size(); - in_dims[rank - 1] = 1; - output->Resize(in_dims); // unsqueeze output -> [..., 1] - framework::NPUAttributeMap attr; - attr["axis"] = 0; - attr["offsets"] = std::vector(rank, 0); - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("Crop", {*input, *output}, {*output}, attr).Run(stream); - output->Resize(out_dims); // restore it - } - - void SliceAssignNPU(const framework::ExecutionContext* ctx, - const Tensor* value_tensor, Vector_Int64& start, - Vector_Int64& end, Vector_Int64& steps, - Vector_Int64& axes, Tensor* assigned_tensor) const { - // must ensure assigned_tensor and value_tensor have the same shape - // not support steps < 0 - // output is also the assigned_tensor. - VLOG(4) << "start function SliceAssignND"; - auto stream = - ctx->template device_context() - .stream(); - for (size_t i = 0; i < steps.size(); ++i) { - PADDLE_ENFORCE_GT(steps[i], 0, - platform::errors::InvalidArgument( - "Currently NPU set_value operator doesn't support " - "negative steps, but got %d as step", - steps[i])); - } - std::vector> npu_slice(3); - GetNPUStartEndSteps(start, end, steps, axes, assigned_tensor->dims(), - npu_slice); - auto tile_numbers = MininumPadNumberMakeSureLastDimGT8(npu_slice); - int assigned_tensor_tile_number = tile_numbers[0]; - int value_tensor_tile_number = tile_numbers[1]; - VLOG(4) << "tile number is : " << assigned_tensor_tile_number << " " - << value_tensor_tile_number; - - Tensor tiled_assigned_tns, tiled_value_tns; - if (assigned_tensor_tile_number > 0) { - TileTensor(ctx, assigned_tensor, &tiled_assigned_tns); - TileTensor(ctx, value_tensor, &tiled_value_tns); - // output have different shape, so use a tmp variable before_crop_output; - // add last dim = min_last_dim_value_ in slice - npu_slice[0].push_back(0); - npu_slice[1].push_back(min_last_dim_value_); - npu_slice[2].push_back(1); - } - - framework::NPUAttributeMap attr_input; - attr_input["begin"] = - std::vector(npu_slice[0].begin(), npu_slice[0].end()); - attr_input["end"] = - std::vector(npu_slice[1].begin(), npu_slice[1].end()); - attr_input["strides"] = - std::vector(npu_slice[2].begin(), npu_slice[2].end()); - attr_input["begin_mask"] = 0; - attr_input["end_mask"] = 0; - attr_input["ellipsis_mask"] = 0; - attr_input["new_axis_mask"] = 0; - attr_input["shrink_axis_mask"] = 0; - if (assigned_tensor_tile_number > 0) { - NpuOpRunner("StridedSliceAssignD", {tiled_assigned_tns, tiled_value_tns}, - {tiled_assigned_tns}, attr_input) - .Run(stream); // Remember, set output = input, and this op will - // change the input value. 
- } else { - NpuOpRunner("StridedSliceAssignD", {*assigned_tensor, *value_tensor}, - {*assigned_tensor}, attr_input) - .Run(stream); - } - if (assigned_tensor_tile_number > 0) { - CropTensor(ctx, &tiled_assigned_tns /*initialzied*/, - assigned_tensor /*initalized*/); - } - } - - void ModifyAxesAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& axes_to_modify) const { - if (none_axes.empty()) return; - auto none_axes_copy = none_axes; - sort(none_axes_copy.begin(), none_axes_copy.end()); - for (size_t i = 0; i < axes_to_modify.size(); ++i) { - int axis = axes_to_modify[i]; - auto upper = - upper_bound(none_axes_copy.begin(), none_axes_copy.end(), axis); - // Example: none_axes = [1,3,4,5,7] - // axis = 4 - // find the element number less or equal than 4, which is - // 3(1,3,4) - // axis becomes 4 + 3 = 7 ; - axes_to_modify[i] = axis + (upper - none_axes_copy.begin()); - } - } - - void UnsqueezeAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& slice_dims) const { - // note : axes will change, because new axes inserted. - // sum array to modify the axes. because more simply - if (none_axes.empty()) return; - Vector_Int64 slice_dims_with_none; - size_t none_axes_cur = 0; - for (size_t i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= static_cast(i)) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims_with_none.push_back(slice_dims[i]); - } - // if the none_axes.size() > slice_dims.size(), append 1 after last dim - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims = slice_dims_with_none; - } +using NPUDeviceContext = platform::NPUDeviceContext; - void ModiftyDimsAccordingNoneAndDecrease(Vector_Int64& slice_dim, - Vector_Int64& value_dim, - Vector_Int64& axes, - Vector_Int64& none_axes, - Vector_Int64& dec_axes) const { - // change the value of slice_dim, value_dim, start, end, steps, axes by none - // and decrease axes - // after change, this values can be passed to SliceAssignNPU() directly. 
- - // Modity Slice Dim - UnsqueezeAccordingNoneAxes(none_axes, slice_dim); - ModifyAxesAccordingNoneAxes(none_axes, dec_axes); - ModifyAxesAccordingNoneAxes(none_axes, axes); - // Modity Value Dim by new slice dim - auto slice_dim_reverse = slice_dim; - auto value_dim_reverse = value_dim; - std::reverse(slice_dim_reverse.begin(), slice_dim_reverse.end()); - std::reverse(value_dim_reverse.begin(), value_dim_reverse.end()); - - Vector_Int64 new_value_dim; - PADDLE_ENFORCE_GE( - slice_dim.size(), value_dim.size(), - platform::errors::InvalidArgument("The size of expanded slice_dim(%d) " - "must greater than the value_dim(%d)", - slice_dim.size(), value_dim.size())); +template +class SetValueNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("Input"); + auto* value_tensor = ctx.Input("ValueTensor"); + auto* out = ctx.Output("Out"); - size_t value_cur = 0; - size_t rank = slice_dim.size(); - for (size_t i = 0; i < rank; ++i) { - auto& xsize = slice_dim_reverse[i]; - if (value_cur >= value_dim_reverse.size()) { - new_value_dim.push_back(1); - continue; - } - auto& vsize = value_dim_reverse[value_cur]; - auto it = find(dec_axes.begin(), dec_axes.end(), rank - 1 - i); - if (it != dec_axes.end()) { - // found, insert one dim ; - PADDLE_ENFORCE_EQ(xsize, 1, platform::errors::InvalidArgument( - "The dims refered by decrease axes is " - "not equal to 1, some wrongs happen")); - new_value_dim.push_back(1); - continue; - } - if (xsize == vsize || vsize == 1) { - new_value_dim.push_back(vsize); - ++value_cur; - continue; - } - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - for (; value_cur < value_dim_reverse.size(); ++value_cur) { - if (value_dim_reverse[value_cur] != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - } - std::reverse(new_value_dim.begin(), new_value_dim.end()); - value_dim = new_value_dim; - return; - } + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto steps_tensor_list = ctx.MultiInput("StepsTensorList"); - public: - void Compute(const framework::ExecutionContext& ctx) const override { - VLOG(2) << "Start Set Value Npu Kernel"; - auto* in = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - auto* value_tensor = ctx.Input("ValueTensor"); - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); auto ends = ctx.Attr>("ends"); @@ -302,17 +39,6 @@ class SetValueNPUKernel : public framework::OpKernel { auto shape = ctx.Attr>("shape"); auto decrease_axes = ctx.Attr>("decrease_axes"); auto none_axes = ctx.Attr>("none_axes"); - auto dtype = in->type(); - - if (dtype == framework::proto::VarType::FP64 || - dtype == framework::proto::VarType::INT64 || - dtype == framework::proto::VarType::BOOL) { - auto value_type_name = GetValueName(dtype); - PADDLE_THROW(platform::errors::InvalidArgument( - "The NPU setvalue kernel currently only support FLOAT32 and INT32, " - "but got type: %s", - value_type_name.data())); - } if (!starts_tensor_list.empty()) { starts = GetDataFromTensorList(starts_tensor_list); @@ -327,65 +53,137 @@ class 
SetValueNPUKernel : public framework::OpKernel { auto in_dims = in->dims(); CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto place = ctx.GetPlace(); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); + + auto slice_dims_for_assign = decrease_slice_dims; + if (!none_axes.empty()) { + std::vector slice_dims_with_none; + + size_t none_axes_cur = 0, decrease_axes_cur = 0; + for (int i = 0; i < slice_dims.size(); ++i) { + while (none_axes_cur < none_axes.size() && + none_axes[none_axes_cur] <= i) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + if (decrease_axes_cur < decrease_axes.size() && + decrease_axes[decrease_axes_cur] == i) { + decrease_axes_cur++; + } else { + slice_dims_with_none.push_back(slice_dims[i]); + } + } + while (none_axes_cur < none_axes.size()) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } - // aforementioned code is copyed directly from CPU kernel. - // (@xiongkun03) the following is redesigned by xiongkun. because NPU can do - // step slice assignment. so we deal with all none_axes and decrease_axes - // here. - // 1. we insert 1 into assigned_tensor_shape according to none_axes; - // 2. we insert 1 into value_tensor_shape(value tensor) according to - // decrease_axes; - // 3. we reshape back the assigned_tensor. and return it. - // note : we use a tmp_value_tensor as value_tns. it shares data with - // value_tensor; - // I believe the logic is more simple than cpu logic. + slice_dims_for_assign = framework::make_ddim(slice_dims_with_none); + } + + TensorCopy(*in, ctx.GetPlace(), out); + + auto starts_indices = std::vector(in_dims.size(), 0); + auto ends_indices = std::vector(in_dims.size(), 0); + auto strides_indices = std::vector(in_dims.size(), 0); + + for (int i = 0; i < in_dims.size(); ++i) { + starts_indices[i] = 0; + ends_indices[i] = slice_dims[i]; + strides_indices[i] = 1; + } + for (size_t i = 0; i < axes.size(); i++) { + int axis_index = axes[i]; + starts_indices[axis_index] = starts[i]; + ends_indices[axis_index] = ends[i]; + strides_indices[axis_index] = steps[i]; + } + + int64_t stride_step = framework::product(in_dims); + std::vector index_indices(1, 0); + for (size_t i = 0; i < strides_indices.size(); ++i) { + auto index_size = index_indices.size(); + stride_step /= in_dims[i]; + for (size_t j = 0; j < index_size; ++j) { + auto start_index = *index_indices.begin(); + if (strides_indices[i] > 0) { + for (int64_t k = starts_indices[i]; k < ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } else { + for (int64_t k = starts_indices[i]; k > ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } + index_indices.erase(index_indices.begin()); + } + } - TensorCopy(*in, place, out); - Tensor value_t(dtype); + PADDLE_ENFORCE_EQ( + static_cast(index_indices.size()), + framework::product(slice_dims_for_assign), + platform::errors::InvalidArgument( + "OP(set_value) error index indices and value update not match ")); - if (value_tensor == nullptr) { + Tensor value_t(in->type()); + if (value_tensor != nullptr) { + value_t.ShareDataWith(*value_tensor); + } else { auto value_dims = framework::make_ddim(shape); - value_t.mutable_data(value_dims, place); - auto value_name = GetValueName(dtype); + CheckIsDimsMatch(slice_dims_for_assign, value_dims); + + value_t.mutable_data(value_dims, ctx.GetPlace()); + auto 
value_name = GetValueName(in->type()); CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); value_t.Resize(value_dims); } - const Tensor* value_tensor_ptr = - (value_tensor == nullptr) ? &value_t : value_tensor; - auto value_dims_vec = framework::vectorize(value_tensor_ptr->dims()); - auto slice_dims_vec = framework::vectorize(slice_dims); - auto in_dims_vec = framework::vectorize(in_dims); - - UnsqueezeAccordingNoneAxes(none_axes, in_dims_vec); - ModiftyDimsAccordingNoneAndDecrease(slice_dims_vec, value_dims_vec, axes, - none_axes, - decrease_axes); // Modify and Check + auto stream = ctx.template device_context().stream(); - Tensor reshaped_value_tensor, broadcast_value_tensor; - reshaped_value_tensor.ShareDataWith(*value_tensor_ptr); - reshaped_value_tensor.Resize(framework::make_ddim(value_dims_vec)); - - BroadcastToD(&ctx, &reshaped_value_tensor, &slice_dims_vec, - &broadcast_value_tensor /*inner function initialized*/); + Tensor value_temp(in->type()); + if (slice_dims_for_assign == value_t.dims()) { + value_temp.ShareDataWith(value_t); + } else { + value_temp.Resize(slice_dims_for_assign); + value_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(value_t) + .AddInput(framework::vectorize(slice_dims_for_assign)) + .AddOutput(value_temp) + .Run(stream); + } - out->Resize(framework::make_ddim(in_dims_vec)); - SliceAssignNPU(&ctx, &broadcast_value_tensor, starts, ends, steps, axes, - out); - out->Resize(in_dims); // Reshape Back + int64_t input_numel = framework::product(in_dims); + int64_t index_numel = index_indices.size(); + + Tensor in_temp, out_temp, val_temp; + in_temp.ShareDataWith(*in); + out_temp.ShareDataWith(*out); + val_temp.ShareDataWith(value_temp); + in_temp.Resize(framework::make_ddim({input_numel})); + out_temp.Resize(framework::make_ddim({input_numel})); + val_temp.Resize(framework::make_ddim({index_numel})); + + NpuOpRunner runner; + runner.SetType("ScatterUpdate") + .AddInput(in_temp) + .AddInput(std::move(index_indices)) + .AddInput(val_temp) + .AddOutput(out_temp) + .Run(stream); } - - private: - const int min_last_dim_value_ = - 32 / sizeof(T); // 16 for float16 , 8 for float32 }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - set_value, ops::SetValueNPUKernel, - ops::SetValueNPUKernel) + +REGISTER_OP_NPU_KERNEL(set_value, ops::SetValueNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SetValueNPUKernel, +#endif + ops::SetValueNPUKernel) diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 1084eadc55c5b..52351a98bce37 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -12,18 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the Licnse. 
*/ -#include -#include - -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, const std::vector starts, const std::vector ends, @@ -54,7 +50,7 @@ void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, } } -template +template class SliceNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -128,17 +124,14 @@ class SliceNPUKernel : public framework::OpKernel { UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); + auto stream = ctx.template device_context().stream(); const auto& runner = NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, {"size", size}}); - - auto stream = - ctx.template device_context() - .stream(); runner.Run(stream); } }; -template +template class SliceGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -181,12 +174,37 @@ class SliceGradNPUKernel : public framework::OpKernel { paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); } + Tensor tmp_dout; + tmp_dout.ShareDataWith(*dout); + auto out_dims = dout->dims(); + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == static_cast(in_dims.size())) { + out_dims = framework::make_ddim(std::vector(decrease_size, 1)); + } else { + std::vector origin_out_shape(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; + } + int index = 0; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; + ++index; + } + } + out_dims = framework::make_ddim(origin_out_shape); + } + tmp_dout.Resize(out_dims); + } + dinput->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() .stream(); const auto& runner = - NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); + NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); runner.Run(stream); } }; @@ -196,15 +214,13 @@ class SliceGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - slice, ops::SliceNPUKernel, - ops::SliceNPUKernel, - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); +REGISTER_OP_NPU_KERNEL(slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SliceNPUKernel, +#endif + ops::SliceNPUKernel); + +REGISTER_OP_NPU_KERNEL(slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 0c2d39e7519ef..78e813edda930 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" -#include -#include -#include -#include #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -54,8 +50,7 @@ class SoftmaxWithCrossEntropyOpMaker "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, " "where labels is ont-hot." "Currently, the tensor is generated and used in npu kernel only. ") - .AsIntermediate() - .AsDispensable(); + .AsIntermediate(); #endif AddOutput("Loss", "(Tensor, default: Tensor), A tensor in same shape with " @@ -136,6 +131,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasOutput("Softmax"), true, platform::errors::InvalidArgument( "Output(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"), true, + platform::errors::InvalidArgument( + "Output(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasOutput("Loss"), true, platform::errors::InvalidArgument("Output(Loss) should be not null.")); @@ -225,6 +225,11 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, platform::errors::InvalidArgument( "Input(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"), true, + platform::errors::InvalidArgument( + "Input(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasInput("Label"), true, platform::errors::InvalidArgument("Input(Label) should be not null.")); diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc new file mode 100644 index 0000000000000..9b6bc1b629045 --- /dev/null +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -0,0 +1,193 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class SparseAttentionOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Q", + "(Tensor), The input tensor of query in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput( + "K", + "(Tensor), The input tensor of key in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput( + "V", + "(Tensor), The input tensor of value in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput("Offset", + "(Tensor, default: Tensor), The input tensor of offset in " + "CSR sparse format, " + "whose dimension : `[batch_size, num_heads, target_len + 1]`."); + AddInput("Columns", + "(Tensor, default: Tensor), The input tensor of columns in " + "CSR sparse format, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_num]`."); + AddOutput( + "Out", + "(Tensor), The output tensor of result in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddOutput("SparseDotSdd", + "(Tensor), The output tensor of result in SparseDotSdd step, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_dim]`.") + .AsIntermediate(); + AddOutput("Softmax", + "(Tensor), The output tensor of result in Softmax step, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_dim]`.") + .AsIntermediate(); + AddComment(R"DOC( + Compute the value of the sparse attention module. Its input value includes five tensors. + Q, K, and V represent query, key, and value in the Attention module, respectively. + The CSR format is used to represent the sparsity feature in the Attention module. + The CSR format contains two tensors, offset and columns. 
+ )DOC"); + } +}; + +class SparseAttentionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("K"), "Input", "K", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("Columns"), "Input", "Columns", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("SparseDotSdd"), "Output", "SparseDotSdd", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "sparse_attention"); + + auto dims_q = ctx->GetInputDim("Q"); + auto dims_k = ctx->GetInputDim("K"); + auto dims_v = ctx->GetInputDim("V"); + auto dims_columns = ctx->GetInputDim("Columns"); + + PADDLE_ENFORCE_EQ(dims_q.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in query' shapes should be 4.")); + PADDLE_ENFORCE_EQ(dims_k.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in key' shapes should be 4.")); + PADDLE_ENFORCE_EQ(dims_v.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in value' shapes should be 4.")); + + auto batch_size = dims_q[0]; + auto num_heads = dims_q[1]; + auto M = dims_q[2]; + auto N = dims_q[3]; + auto sparse_nnz = dims_columns[2]; + ctx->SetOutputDim("Out", {batch_size, num_heads, M, N}); + ctx->SetOutputDim("SparseDotSdd", {batch_size, num_heads, sparse_nnz}); + ctx->SetOutputDim("Softmax", {batch_size, num_heads, sparse_nnz}); + ctx->ShareLoD("Q", "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "Q", "K"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class SparseAttentionOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("K"), "Input", "K", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Columns"), "Input", "Columns", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("SparseDotSdd"), "Input", "SparseDotSdd", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Softmax"), "Input", "Softmax", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "sparse_attention_grad"); + + auto x_grad_name = framework::GradVarName("Q"); + auto y_grad_name = framework::GradVarName("K"); + auto z_grad_name = framework::GradVarName("V"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("Q")); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("K")); + } + if (ctx->HasOutput(z_grad_name)) { + ctx->SetOutputDim(z_grad_name, ctx->GetInputDim("V")); + } + } + + framework::OpKernelType 
GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class SparseAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("sparse_attention_grad"); + op->SetInput("Q", this->Input("Q")); + op->SetInput("K", this->Input("K")); + op->SetInput("V", this->Input("V")); + op->SetInput("Offset", this->Input("Offset")); + op->SetInput("Columns", this->Input("Columns")); + op->SetInput("SparseDotSdd", this->Output("SparseDotSdd")); + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Q"), this->InputGrad("Q")); + op->SetOutput(framework::GradVarName("K"), this->InputGrad("K")); + op->SetOutput(framework::GradVarName("V"), this->InputGrad("V")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sparse_attention, ops::SparseAttentionOp, + ops::SparseAttentionOpMaker, + ops::SparseAttentionGradOpMaker, + ops::SparseAttentionGradOpMaker); + +REGISTER_OPERATOR(sparse_attention_grad, ops::SparseAttentionOpGrad); diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu new file mode 100644 index 0000000000000..88ee8999c5f4a --- /dev/null +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -0,0 +1,537 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_CUDA) +#include "paddle/fluid/platform/dynload/cusparse.h" +#endif + +namespace ops = paddle::operators; +namespace plf = paddle::platform; + +namespace paddle { +namespace operators { + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template +__device__ __forceinline__ void WarpReduceSum(T* sum) { +#pragma unroll + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch_size; ++i) { + T sum_val = CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + sum[i] = sum[i] + sum_val; + } + } +} + +template +__device__ __forceinline__ void WarpReduceMax(T* sum) { +#pragma unroll + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch_size; ++i) { + T max_val = CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + sum[i] = max(sum[i], max_val); + } + } +} + +template +__global__ void BlockSparseSoftmaxForward(T* softmax, const T* src, T scale, + const T* kp_mask, const T* attn_mask, + const int* layout_rowptr, + const int* layout_colindex, + int num_rows) { + // current thread related info + const int WarpSize = 32; + const int cur_row = blockIdx.x * blockDim.y + threadIdx.y; + if (cur_row < num_rows) { + const int cur_block_row = cur_row / BlockSize; + const int cur_block_nnz = + layout_rowptr[cur_block_row + 1] - layout_rowptr[cur_block_row]; + + T srcdata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + T attndata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + + // read kp mask + T cur_kp_mask = (kp_mask == nullptr) ? 0 : kp_mask[cur_row]; + + // read tensor data, attn mask + const int iter = (cur_block_nnz + WarpSize - 1) / WarpSize; + const T* srcptr = src + layout_rowptr[cur_block_row]; + T* attnptr = nullptr; + if (attn_mask != nullptr) { + const T* attnptr = attn_mask + cur_block_row * num_rows; + } + const int* colindex = layout_colindex + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + if ((attnptr != nullptr) && + std::abs(attnptr[colindex[cur_block_col]]) < + std::numeric_limits::epsilon()) { + srcdata[cur_reg_index] = + -std::numeric_limits::infinity() * scale + cur_kp_mask; + } else { + srcdata[cur_reg_index] = scale * srcptr[cur_block_col] + cur_kp_mask; + } + } else { + srcdata[cur_reg_index] = -std::numeric_limits::infinity(); + } + } + + // max value + T max_value = srcdata[0]; + const int kIteration = + (cur_block_nnz * BlockSize + WarpSize - 1) / WarpSize; +#pragma unroll + for (int it = 1; it < kIteration; ++it) { + max_value = (max_value > srcdata[it]) ? 
max_value : srcdata[it]; + } + WarpReduceMax(&max_value); + + // exp sum + T sum = 0; +#pragma unroll + for (int it = 0; it < kIteration; ++it) { + srcdata[it] = std::exp(srcdata[it] - max_value); + sum += srcdata[it]; + } + WarpReduceSum(&sum); + + // compute softmax and write out + T* softmaxptr = softmax + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + softmaxptr[cur_block_col] = srcdata[cur_reg_index] / sum; + } + } + } +} + +template +__global__ void BlockSparseSoftmaxBackward(T* dst, const T* grad, const T* src, + T scale, const int* layout_rowptr, + const int* layout_colindex, + int num_rows) { + // current thread related info + const int WarpSize = 32; + const int cur_row = blockIdx.x * blockDim.y + threadIdx.y; + if (cur_row < num_rows) { + const int cur_block_row = cur_row / BlockSize; + const int cur_block_nnz = + layout_rowptr[cur_block_row + 1] - layout_rowptr[cur_block_row]; + + T srcdata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + T graddata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + + // read tensor data, attn mask + const int iter = (cur_block_nnz + WarpSize - 1) / WarpSize; + const T* srcptr = src + layout_rowptr[cur_block_row]; + const T* gradptr = grad + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + srcdata[cur_reg_index] = srcptr[cur_block_col]; + graddata[cur_reg_index] = gradptr[cur_block_col]; + } else { + srcdata[cur_reg_index] = 0; + graddata[cur_reg_index] = 0; + } + } + + T sum = 0; + const int kIteration = + (cur_block_nnz * BlockSize + WarpSize - 1) / WarpSize; +#pragma unroll + for (int it = 0; it < kIteration; ++it) { + sum += srcdata[it] * graddata[it]; + } + WarpReduceSum(&sum); + + // compute softmax and write out + T* dstptr = dst + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + dstptr[cur_block_col] = + scale * srcdata[cur_reg_index] * (graddata[cur_reg_index] - sum); + } + } + } +} + +using Tensor = framework::Tensor; +/* +input: sparse C in CSR format (num_rows,num_rows) +output: sparse C after softmax operation +*/ +template +void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, + const Tensor* offset, const Tensor* columns, + Tensor* input, Tensor* output, const int blocksize, + const int num_rows, const int num_cols) { + const int* offset_data = offset->data(); + const int* columns_data = columns->data(); + T* input_data = input->data(); + T* output_data = output->data(); + + const int block_size = 1; + dim3 blocks(32, 4, 1); + int grid = (num_rows * block_size + 3) / 4; + T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); + + const int block_nnz_max = 256; + BlockSparseSoftmaxForward<<>>( + output_data, input_data, scaling, nullptr, nullptr, offset_data, + columns_data, num_rows); +} + +template +void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, + const Tensor* offset, const Tensor* columns, + Tensor* dx, const Tensor* dout, const Tensor* out, + const int blocksize, const int num_rows, + const int num_cols) { + const int* offset_data = offset->data(); + const int* columns_data = columns->data(); + T* dx_data = dx->data(); + const T* dout_data = dout->data(); + const T* out_data = 
out->data(); + + const int block_size = 1; + dim3 blocks(32, 4, 1); + int grid = (num_rows * block_size + 3) / 4; + T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); + + const int block_nnz_max = 256; + BlockSparseSoftmaxBackward<<>>( + dx_data, dout_data, out_data, scaling, offset_data, columns_data, + num_rows); +} + +using VarType = framework::proto::VarType; +inline cudaDataType_t GetGpuType(const VarType::Type data_type) { + if (data_type == VarType::FP32) { + return CUDA_R_32F; + } else if (data_type == VarType::FP64) { + return CUDA_R_64F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support tensor type in sparse_attention OP: %s", + framework::DataTypeToString(data_type))); + } +} + +inline cusparseOperation_t GetTransposeOperation(const bool transpose) { + if (transpose) { + return CUSPARSE_OPERATION_TRANSPOSE; + } else { + return CUSPARSE_OPERATION_NON_TRANSPOSE; + } +} + +void CusparseDestroy(cusparseDnMatDescr_t* dn_mat_first, + cusparseDnMatDescr_t* dn_mat_second, + cusparseSpMatDescr_t* sp_mat) { + platform::dynload::cusparseDestroyDnMat(*dn_mat_first); + platform::dynload::cusparseDestroyDnMat(*dn_mat_second); + platform::dynload::cusparseDestroySpMat(*sp_mat); +} + +/* +input: dense A (num_rows,num_cols), dense B (num_rows,num_cols) +output: sparse C in CSR format (num_rows,num_rows) +*/ +template +void DotSdd(const platform::CUDADeviceContext& ctx, const Tensor* a, + const Tensor* b, const Tensor* c_offset, const Tensor* c_columns, + Tensor* c_value, const int num_rows, const int num_cols, + const bool a_transpose, const bool b_transpose) { + const T* a_data = a->data(); + const T* b_data = b->data(); + const int* c_offset_data = c_offset->data(); + const int* c_columns_data = c_columns->data(); + T* c_value_data = c_value->data(); + + cudaDataType_t gpu_type = GetGpuType(c_value->type()); + cusparseHandle_t handle = nullptr; + cusparseDnMatDescr_t mat_a, mat_b; + cusparseSpMatDescr_t mat_c; + platform::dynload::cusparseCreate(&handle); + + // Create dense matrix A + platform::dynload::cusparseCreateDnMat(&mat_a, num_rows, num_cols, num_cols, + const_cast(a_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create dense matrix B + platform::dynload::cusparseCreateDnMat(&mat_b, num_rows, num_cols, num_cols, + const_cast(b_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create sparse matrix C in CSR format + int c_nnz = c_columns->dims()[1]; + platform::dynload::cusparseCreateCsr( + &mat_c, num_rows, num_rows, c_nnz, const_cast(c_offset_data), + const_cast(c_columns_data), c_value_data, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, gpu_type); + + T alpha = 1; + T beta = 0; + + size_t buffer_size = 0; + platform::dynload::cusparseSDDMM_bufferSize( + handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, mat_a, mat_b, &beta, mat_c, + gpu_type, CUSPARSE_SDDMM_ALG_DEFAULT, &buffer_size); + auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size); + void* d_buffer = static_cast(d_buffer_ptr->ptr()); + + platform::dynload::cusparseSDDMM(handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, + mat_a, mat_b, &beta, mat_c, gpu_type, + CUSPARSE_SDDMM_ALG_DEFAULT, d_buffer); + + CusparseDestroy(&mat_a, &mat_b, &mat_c); + platform::dynload::cusparseDestroy(handle); +} + +/* +input: sparse A in CSR format (num_rows,num_rows), dense B (num_rows,num_cols) +output: dense C (num_rows,num_cols) +*/ +template +void DotDsd(const platform::CUDADeviceContext& ctx, const Tensor* 
a_offset, + const Tensor* a_columns, const Tensor* a_value, const Tensor* b, + Tensor* c, const int num_rows, const int num_cols, + const bool a_transpose, const bool b_transpose) { + const int* a_offset_data = a_offset->data(); + const int* a_columns_data = a_columns->data(); + const T* a_value_data = a_value->data(); + const T* b_data = b->data(); + T* c_data = c->data(); + + cudaDataType_t gpu_type = GetGpuType(c->type()); + cusparseHandle_t handle = nullptr; + cusparseSpMatDescr_t mat_a; + cusparseDnMatDescr_t mat_b, mat_c; + platform::dynload::cusparseCreate(&handle); + + // Create sparse matrix A in CSR format + int a_nnz = a_columns->dims()[1]; + platform::dynload::cusparseCreateCsr( + &mat_a, num_rows, num_rows, a_nnz, const_cast(a_offset_data), + const_cast(a_columns_data), const_cast(a_value_data), + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + + // Create dense matrix B + platform::dynload::cusparseCreateDnMat(&mat_b, num_rows, num_cols, num_cols, + const_cast(b_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create dense matrix C + platform::dynload::cusparseCreateDnMat(&mat_c, num_rows, num_cols, num_cols, + c_data, gpu_type, CUSPARSE_ORDER_ROW); + + T alpha = 1; + T beta = 0; + + size_t buffer_size = 0; + // allocate an external buffer if needed + platform::dynload::cusparseSpMM_bufferSize( + handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, mat_a, mat_b, &beta, mat_c, + gpu_type, CUSPARSE_SPMM_ALG_DEFAULT, &buffer_size); + auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size); + void* d_buffer = static_cast(d_buffer_ptr->ptr()); + + platform::dynload::cusparseSpMM(handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, + mat_a, mat_b, &beta, mat_c, gpu_type, + CUSPARSE_SPMM_ALG_DEFAULT, d_buffer); + + CusparseDestroy(&mat_b, &mat_c, &mat_a); + platform::dynload::cusparseDestroy(handle); +} + +std::vector GetSplitTensor(Tensor* input) { + auto dims = input->dims(); + int batch_size = dims[0]; + int num_heads = dims[1]; + std::vector new_dims(dims.size() - 1); + new_dims[0] = batch_size * num_heads; + for (int i = 1; i < new_dims.size(); i++) { + new_dims[i] = dims[i + 1]; + } + input->Resize(framework::make_ddim(new_dims)); + return input->Split(1, 0); +} + +template +class SparseAttentionCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto output_ptr = ctx.Output("Out"); + output_ptr->mutable_data(ctx.GetPlace()); + auto sparse_dot_sdd_ptr = ctx.Output("SparseDotSdd"); + sparse_dot_sdd_ptr->mutable_data(ctx.GetPlace()); + auto softmax_ptr = ctx.Output("Softmax"); + softmax_ptr->mutable_data(ctx.GetPlace()); + + auto output = *output_ptr; + auto result_sdd = *sparse_dot_sdd_ptr; + auto result_softmax = *softmax_ptr; + + auto query_dims = query.dims(); + int batch_size = query_dims[0]; + int num_heads = query_dims[1]; + int M = query_dims[2]; + int N = query_dims[3]; + + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector result_sdd_lists = GetSplitTensor(&result_sdd); + std::vector result_softmax_lists 
= GetSplitTensor(&result_softmax); + std::vector output_lists = GetSplitTensor(&output); + + const auto& dev_ctx = ctx.cuda_device_context(); + const int iter_num = batch_size * num_heads; + for (int i = 0; i < iter_num; i++) { + DotSdd(dev_ctx, &query_lists[i], &key_lists[i], + &offset_lists[i], &columns_lists[i], + &result_sdd_lists[i], M, N, false, true); + + SparseSoftmaxForward( + dev_ctx, &offset_lists[i], &columns_lists[i], &result_sdd_lists[i], + &result_softmax_lists[i], 1, M, N); + + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &result_softmax_lists[i], &value_lists[i], + &output_lists[i], M, N, false, false); + } + } +}; + +template +class SparseAttentionGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto sparse_dot_sdd = *ctx.Input("SparseDotSdd"); + auto softmax = *ctx.Input("Softmax"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dquery_ptr = ctx.Output(framework::GradVarName("Q")); + auto* dkey_ptr = ctx.Output(framework::GradVarName("K")); + auto* dvalue_ptr = ctx.Output(framework::GradVarName("V")); + dquery_ptr->mutable_data(ctx.GetPlace()); + dkey_ptr->mutable_data(ctx.GetPlace()); + dvalue_ptr->mutable_data(ctx.GetPlace()); + auto dquery = *dquery_ptr; + auto dkey = *dkey_ptr; + auto dvalue = *dvalue_ptr; + + auto query_dims = query.dims(); + int batch_size = query_dims[0]; + int num_heads = query_dims[1]; + int M = query_dims[2]; + int N = query_dims[3]; + + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector sparse_dot_sdd_lists = GetSplitTensor(&sparse_dot_sdd); + std::vector softmax_lists = GetSplitTensor(&softmax); + std::vector dout_lists = GetSplitTensor(&dout); + std::vector dquery_lists = GetSplitTensor(&dquery); + std::vector dkey_lists = GetSplitTensor(&dkey); + std::vector dvalue_lists = GetSplitTensor(&dvalue); + + const int iter_num = batch_size * num_heads; + const auto& dev_ctx = ctx.cuda_device_context(); + for (int i = 0; i < iter_num; i++) { + // dValue = transpose(result_softmax) * dOut + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &softmax_lists[i], &dout_lists[i], + &dvalue_lists[i], M, N, true, false); + + // dSoftmax = dOut * transpose(Value) + int nnz_num = columns.dims()[0]; + Tensor dsoftmax; + dsoftmax.Resize({nnz_num}); + dsoftmax.mutable_data(ctx.GetPlace()); + DotSdd(dev_ctx, &dout_lists[i], &value_lists[i], + &offset_lists[i], &columns_lists[i], &dsoftmax, + M, N, false, true); + + // dSparseDotSdd = dSoftmax * softmax'(SparseDotSdd) + Tensor dsparse_dot_sdd; + dsparse_dot_sdd.Resize({nnz_num}); + dsparse_dot_sdd.mutable_data(ctx.GetPlace()); + SparseSoftmaxBackward( + dev_ctx, &offset_lists[i], &columns_lists[i], &dsparse_dot_sdd, + &dsoftmax, &softmax_lists[i], 1, M, N); + + // dQuery = dSparseDotSdd * Key + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &dsparse_dot_sdd, &key_lists[i], + &dquery_lists[i], M, N, false, false); + + // dKey = transpose(dSparseDotSdd) * Query + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &dsparse_dot_sdd, &query_lists[i], + &dkey_lists[i], M, N, true, false); + } + } +}; + +} // 
namespace operators +} // namespace paddle +REGISTER_OP_CUDA_KERNEL( + sparse_attention, + ops::SparseAttentionCUDAKernel, + ops::SparseAttentionCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + sparse_attention_grad, + ops::SparseAttentionGradCUDAKernel, + ops::SparseAttentionGradCUDAKernel); diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 9aa5ca39d737e..24dffaad41b5f 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -83,9 +83,7 @@ static inline std::string get_cufft_error_info(cufftResult error) { } static inline void CUFFT_CHECK(cufftResult error) { - if (error != CUFFT_SUCCESS) { - PADDLE_THROW(platform::errors::External(get_cufft_error_info(error))); - } + PADDLE_ENFORCE_CUDA_SUCCESS(error); } // This struct is used to easily compute hashes of the @@ -413,6 +411,7 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, ? framework::ToRealType(input.type()) : input.type(); auto fft_type = GetFFTTransformType(input.type(), output.type()); + PlanKey Key(framework::vectorize(input.dims()), framework::vectorize(output.dims()), signal_size, fft_type, value_type); diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 8894ca650de03..de30eab25f3cf 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -113,13 +113,13 @@ class SqueezeOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -140,13 +140,13 @@ class SqueezeGradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -241,13 +241,13 @@ class Squeeze2Op : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return 
framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -287,13 +287,13 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index d592c62d499b3..9ba7c9a3062a0 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -96,6 +96,20 @@ struct PowFunctor { float exp_; }; +template +struct RealMulComplexFunctor { + // x: complex number (a+bj) + // y: complex number (c+0j) pretend to be a real number + // out: complex number (ac+bcj) + inline HOSTDEVICE T operator()(T x, T y) { + PADDLE_ENFORCE_LT(y.imag, 1e-6, platform::errors::InvalidArgument( + "The image part of y must to be 0" + "but got [%d]", + y.imag)); + return platform::complex>(x.real * y.real, x.imag * y.real); + } +}; + static std::vector GetBroadcastShape(InTensors ins) { PADDLE_ENFORCE_EQ(ins.size(), 2, platform::errors::InvalidArgument( "GetBroadcastShape Receive 2 tensors" @@ -286,6 +300,45 @@ struct DeviceIndependenceTensorOperations { for_range(DiagFunctor(x.data(), x.numel(), output)); return ret; } + + // batch_diag for CPU only + Tensor BatchDiag(const Tensor& x, int batch) { + Tensor out; + auto* x_data = x.data>(); + auto numel = x.numel(); + auto* out_data = out.mutable_data>( + x.dims(), context.GetPlace(), + static_cast(numel * sizeof(math::Real))); + + auto x_dims = x.dims(); + int num_dims = x_dims.size(); + std::vector out_shape; + + for (int i = 0; i < num_dims - 1; ++i) { + out_shape.push_back(x.dims()[i]); + } + out.Resize(framework::make_ddim(out_shape)); + int order = x.dims()[num_dims - 1]; + int stride_out = order * order; + int stride_in = order + 1; + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < order; ++j) { + out_data[i * order + j] = x_data[stride_out * i + stride_in * j]; + } + } + return out; + } + + // a complex number x times a real number y, which is represented as (a+0j) + Tensor RealMulComplex(const Tensor& x, const Tensor& y) { + framework::Tensor ret; + std::vector out_shape = GetBroadcastShape({&x, &y}); + ret.Resize(framework::make_ddim(out_shape)); + ElementwiseComputeEx, DeviceContext, T>( + context, &x, &y, -1, RealMulComplexFunctor(), &ret); + return ret; + } + framework::Tensor Div(const framework::Tensor& x, const framework::Tensor& y) { framework::Tensor ret; @@ -459,6 +512,19 @@ struct DeviceIndependenceTensorOperations { return out; } + Tensor Real(const Tensor& x) { + Tensor out; + auto numel = x.numel(); + auto* out_data = out.mutable_data>( + x.dims(), context.GetPlace(), + static_cast(numel * sizeof(math::Real))); + auto* x_data = x.data(); + auto for_range = GetForRange(numel); + math::RealFunctor functor(x_data, out_data, numel); + for_range(functor); + return out; + } + Tensor DiagFill(const int m, const int n, const int num_lower_diags, const int num_upper_diags, 
const Tensor& scale, const Tensor& input) { diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index ca3a5f957685d..a7d8fe01edd4c 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -51,7 +51,9 @@ class TopkNPUKernel : public framework::OpKernel { indices->mutable_data(ctx.GetPlace()); // prepare assit - auto dim = input->dims().size(); + auto size = input->dims().size(); + // dim is the last dimension of input + auto dim = input->dims()[size - 1]; framework::Tensor assist_seq_tensor; assist_seq_tensor.Resize({2 * dim}); assist_seq_tensor.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc index 035ad5f3f314a..7cc68e93c5d62 100644 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ b/paddle/fluid/operators/transpose_op_npu.cc @@ -27,9 +27,12 @@ class TransposeNPUKernel : public framework::OpKernel { auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); std::vector axis = ctx.Attr>("axis"); - framework::NPUAttributeMap attr_input = {{"perm", axis}}; out->mutable_data(ctx.device_context().GetPlace()); - const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*x) + .AddInput(std::move(axis)) + .AddOutput(*out); auto stream = ctx.template device_context() .stream(); @@ -51,9 +54,11 @@ class TransposeGradNPUKernel : public framework::OpKernel { reversed_axis[axis[i]] = i; } x_grad->mutable_data(ctx.GetPlace()); - framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; - const auto& runner = - NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(reversed_axis)) + .AddOutput(*x_grad); auto stream = ctx.template device_context() .stream(); @@ -72,11 +77,17 @@ REGISTER_OP_NPU_KERNEL( ops::TransposeNPUKernel, ops::TransposeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeNPUKernel, +#endif ops::TransposeNPUKernel, ops::TransposeNPUKernel); REGISTER_OP_NPU_KERNEL(transpose2_grad, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeGradNPUKernel, +#endif ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel); diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 99793ecd244cf..66b0543771f4d 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -403,7 +403,10 @@ class UniqueKernel : public framework::OpKernel { bool return_index = context.Attr("return_index"); bool return_inverse = context.Attr("return_inverse"); bool return_counts = context.Attr("return_counts"); - + if (x->numel() == 0) { + out->mutable_data(context.GetPlace()); + return; + } if (axis_vec.empty()) { framework::VisitDataTypeTiny( data_type, diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h index 82118b692707f..cfd4d6bce8364 100644 --- a/paddle/fluid/operators/unstack_op.h +++ b/paddle/fluid/operators/unstack_op.h @@ -149,7 +149,7 @@ class UnStackKernel : public framework::OpKernel { dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); } auto dy_data = dy->data(); - + if (dy->numel() == 0) return; int pre = 1; for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; int total_num = dy->numel(); diff --git a/paddle/fluid/platform/CMakeLists.txt 
b/paddle/fluid/platform/CMakeLists.txt index 2540170ed54fb..21213f9e6ff21 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -59,9 +59,14 @@ cc_library(cpu_info SRCS cpu_info.cc DEPS ${CPU_INFO_DEPS}) cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) IF(WITH_GPU) + nv_library(cuda_graph SRCS cuda_graph.cc DEPS enforce allocator_facade) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) nv_library(cuda_profiler SRCS cuda_profiler.cc DEPS enforce) + nv_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade cuda_graph) +ELSE() + cc_library(cuda_graph_with_memory_pool SRCS cuda_graph_with_memory_pool.cc DEPS device_context allocator_facade) ENDIF() + IF(WITH_ROCM) hip_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce monitor dynload_cuda) ENDIF() diff --git a/paddle/fluid/platform/collective_helper.cc b/paddle/fluid/platform/collective_helper.cc index a765f344daf8a..03359d932b5ab 100644 --- a/paddle/fluid/platform/collective_helper.cc +++ b/paddle/fluid/platform/collective_helper.cc @@ -148,7 +148,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( paddle::platform::errors::InvalidArgument( "dev ids = [%d], it should greater than 0.", dev_ids.size())); const int kDevices = dev_ids.size(); - VLOG(3) << "Begin CreateNCCLCommMultiTrainer. device number: " << kDevices + VLOG(1) << "Begin CreateNCCLCommMultiTrainer. device number: " << kDevices << ", ntrainers: " << ntrainers << ", train_id: " << train_id << ", rind_id: " << ring_id; ncclComm_t comms[kDevices]; @@ -162,10 +162,10 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( #endif platform::dynload::ncclCommInitRank(comms + i, kDevices * ntrainers, *nccl_id, train_id * kDevices + i); - VLOG(3) << "ncclCommInitRank: " << i; + VLOG(1) << "ncclCommInitRank: " << i; } PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); - VLOG(3) << "nccl group end seccessss"; + VLOG(1) << "nccl group end seccessss"; } PADDLE_ENFORCE_EQ(comm_map_.count(ring_id), 0, platform::errors::InvalidArgument( @@ -174,7 +174,7 @@ void NCCLCommContext::CreateNCCLCommMultiTrainer( for (int i = 0; i < kDevices; ++i) { AssignNCCLComm(comms[i], kDevices * ntrainers, train_id * kDevices + i, dev_ids[i], ring_id); - VLOG(3) << "nccl communicator of train_id " << train_id * kDevices + i + VLOG(1) << "nccl communicator of train_id " << train_id * kDevices + i << " in ring " << ring_id << " has been created on device " << dev_ids[i]; } diff --git a/paddle/fluid/platform/cuda_graph.cc b/paddle/fluid/platform/cuda_graph.cc new file mode 100644 index 0000000000000..693a592799027 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
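For context, the class added in this file exposes a begin/end capture pair plus replay and reset. A minimal usage sketch, assuming a user-created non-default stream and some capturable GPU work (the stream setup and LaunchCapturableKernels below are illustrative, not part of this patch):

    platform::CUDAPlace place(0);
    cudaStream_t stream;
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));  // capture is rejected on default stream 0
    platform::CUDAGraph::BeginCapture(place, stream, cudaStreamCaptureModeRelaxed);
    LaunchCapturableKernels(stream);                         // hypothetical kernels to record
    auto graph = platform::CUDAGraph::EndCapture();          // returns std::unique_ptr<CUDAGraph>
    graph->Replay();                                         // re-launches the recorded work on the same stream
    graph->Reset();                                          // destroys the graph, runs reset callbacks in reverse order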
+ +#include "paddle/fluid/platform/cuda_graph.h" + +namespace paddle { +namespace platform { + +std::unique_ptr CUDAGraph::capturing_graph_{nullptr}; + +void CUDAGraph::Reset() { + if (is_reset_) return; +#if CUDA_VERSION >= 10010 + if (graph_) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphDestroy(graph_)); + graph_ = nullptr; + } + if (exec_graph_) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphExecDestroy(exec_graph_)); + exec_graph_ = nullptr; + } +#endif + // callback should be called in reverse order because the latter added + // callback may rely on the former added callback. + for (auto iter = callbacks_.rbegin(); iter != callbacks_.rend(); ++iter) { + (*iter)(); + } + callbacks_.clear(); + is_reset_ = true; +} + +void CUDAGraph::Replay() { +#if CUDA_VERSION >= 10010 + PADDLE_ENFORCE_EQ(is_reset_, false, + errors::PermissionDenied( + "Cannot replay the CUDA Graph after reset is called.")); + PADDLE_ENFORCE_NOT_NULL(exec_graph_, + errors::PermissionDenied( + "CUDA Graph must be captured before replaying.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGraphLaunch(exec_graph_, stream_)); +#endif +} + +void CUDAGraph::BeginCapture(platform::CUDAPlace place, cudaStream_t stream, + cudaStreamCaptureMode mode) { + ThrowErrorIfNotSupportCUDAGraph(); + PADDLE_ENFORCE_EQ( + IsCapturing(), false, + errors::PermissionDenied("CUDA Graph can only captured one by one.")); + PADDLE_ENFORCE_NOT_NULL( + stream, errors::PermissionDenied( + "CUDA Graph cannot be captured in default CUDA stream 0.")); + capturing_graph_.reset(new CUDAGraph()); + capturing_graph_->place_ = place; + capturing_graph_->stream_ = stream; + + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamBeginCapture(capturing_graph_->stream_, mode)); + cudaStreamCaptureStatus status; + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamGetCaptureInfo( + capturing_graph_->stream_, &status, &(capturing_graph_->id_))); + PADDLE_ENFORCE_EQ(IsValidCapturing(), true, + platform::errors::PermissionDenied( + "CUDA Graph should not be invalidated.")); + VLOG(10) << "Begin to capture CUDA Graph with ID " << capturing_graph_->id_; +} + +std::unique_ptr CUDAGraph::EndCapture() { + ThrowErrorIfNotSupportCUDAGraph(); +#if CUDA_VERSION >= 10010 + PADDLE_ENFORCE_EQ(IsCapturing(), true, + errors::PermissionDenied("No CUDA Graph is capturing.")); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamEndCapture( + capturing_graph_->stream_, &(capturing_graph_->graph_))); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaGraphInstantiate(&(capturing_graph_->exec_graph_), + capturing_graph_->graph_, nullptr, nullptr, 0)); + VLOG(10) << "End to capture CUDA Graph with ID " << capturing_graph_->id_; + return std::move(capturing_graph_); +#endif +} + +bool CUDAGraph::IsValidCapturing() { + if (!IsCapturing()) return false; + cudaStreamCaptureStatus status; + CUDAGraphID id; + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamGetCaptureInfo(capturing_graph_->stream_, &status, &id)); + return status == cudaStreamCaptureStatusActive; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph.h b/paddle/fluid/platform/cuda_graph.h new file mode 100644 index 0000000000000..55ec463556b45 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph.h @@ -0,0 +1,142 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include "cuda.h" // NOLINT +#include "cuda_runtime.h" // NOLINT +#include "paddle/fluid/platform/type_defs.h" + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace platform { + +#if CUDA_VERSION >= 10010 +static void ThrowErrorIfNotSupportCUDAGraph() {} +#else +enum cudaStreamCaptureMode { + cudaStreamCaptureModeGlobal = 0, + cudaStreamCaptureModeThreadLocal = 1, + cudaStreamCaptureModeRelaxed = 2 +}; +static void ThrowErrorIfNotSupportCUDAGraph() { + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported when CUDA version >= 10.1")); +} +#endif + +// NOTE: Currently, we do not support to capture CUDA graph in parallel +// NOTE: Do not use this class directly because it should be used with +// the memory pool. +class CUDAGraph { + DISABLE_COPY_AND_ASSIGN(CUDAGraph); + + // Since the constructor would throw error is CUDA_VERSION < 10010. + // The non-static method of CUDAGraph need not check CUDA_VERSION + // again. + CUDAGraph() { ThrowErrorIfNotSupportCUDAGraph(); } + + public: + ~CUDAGraph() { Reset(); } + + CUDAGraphID ID() const { return id_; } + + void Replay(); + + void Reset(); + + void AddResetCallback(std::function callback) { + std::lock_guard guard(mtx_); + callbacks_.push_back(std::move(callback)); + } + + static void BeginCapture(platform::CUDAPlace place, cudaStream_t stream, + cudaStreamCaptureMode mode); + static std::unique_ptr EndCapture(); + static void AddResetCallbackDuringCapturing(std::function callback) { + capturing_graph_->AddResetCallback(std::move(callback)); + } + + // No need to add CUDA_VERSION macro because capturing_graph_ would + // always be nullptr (constructor throws error) + static bool IsCapturing() { return capturing_graph_ != nullptr; } + + static CUDAGraphID CapturingID() { return capturing_graph_->id_; } + + static platform::CUDAPlace CapturingPlace() { + return capturing_graph_->place_; + } + + // This API can be used to debug which GPU operation is not + // supported during capturing CUDA Graph. + static bool IsValidCapturing(); + + private: +#if CUDA_VERSION >= 10010 + cudaGraph_t graph_{nullptr}; + cudaGraphExec_t exec_graph_{nullptr}; +#endif + cudaStream_t stream_{nullptr}; + platform::CUDAPlace place_; + CUDAGraphID id_{0}; + std::vector> callbacks_; + bool is_reset_{false}; + std::mutex mtx_; + + static std::unique_ptr capturing_graph_; +}; + +#if CUDA_VERSION >= 10010 +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_CUDA_SUCCESS(cudaThreadExchangeStreamCaptureMode(&mode)); + // After cudaThreadExchangeStreamCaptureMode is called, + // the variable "mode" would be set to the old capturing mode. 
+ old_mode_ = mode; + } + } + + ~CUDAGraphCaptureModeGuard() PADDLE_MAY_THROW { + if (UNLIKELY(CUDAGraph::IsCapturing())) { + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaThreadExchangeStreamCaptureMode(&old_mode_)); + } + } + + private: + cudaStreamCaptureMode old_mode_; +}; +#else +class CUDAGraphCaptureModeGuard { + DISABLE_COPY_AND_ASSIGN(CUDAGraphCaptureModeGuard); + + public: + explicit CUDAGraphCaptureModeGuard( + cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed) {} +}; +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.cc b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc new file mode 100644 index 0000000000000..4804d3f6ed301 --- /dev/null +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace platform { + +#ifdef PADDLE_WITH_CUDA +void BeginCUDAGraphCapture(platform::CUDAPlace place, + cudaStreamCaptureMode mode) { + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + + auto stream = dev_ctx->stream(); + CUDAGraph::BeginCapture(place, stream, mode); + auto id = CUDAGraph::CapturingID(); + memory::allocation::AllocatorFacade::Instance().PrepareMemoryPoolForCUDAGraph( + id); + AddResetCallbackIfCapturingCUDAGraph([id] { + memory::allocation::AllocatorFacade::Instance().RemoveMemoryPoolOfCUDAGraph( + id); + }); +} + +std::unique_ptr EndCUDAGraphCapture() { + auto place = CUDAGraph::CapturingPlace(); + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace(place); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + return CUDAGraph::EndCapture(); +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/cuda_graph_with_memory_pool.h b/paddle/fluid/platform/cuda_graph_with_memory_pool.h new file mode 100644 index 0000000000000..f9f0248e5153b --- /dev/null +++ b/paddle/fluid/platform/cuda_graph_with_memory_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
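The wrappers defined just above tie graph capture to the allocator: BeginCUDAGraphCapture prepares a dedicated memory pool for the capturing graph's ID and registers a reset callback that removes it again. A minimal sketch of the intended call pattern (RunOneTrainingStep is a placeholder for whatever capturable work the caller issues on the place's device-context stream):

    platform::CUDAPlace place(0);
    platform::BeginCUDAGraphCapture(place, cudaStreamCaptureModeThreadLocal);
    RunOneTrainingStep();                          // hypothetical work recorded into the graph
    auto graph = platform::EndCUDAGraphCapture();  // std::unique_ptr<CUDAGraph>
    graph->Replay();                               // allocations made during capture stay in the per-graph pool
    graph->Reset();                                // the reset callback removes that memory pool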
+ +#pragma once + +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph.h" +#endif + +namespace paddle { +namespace platform { + +// NOTE: These APIs are not thread-safe. +#ifdef PADDLE_WITH_CUDA +void BeginCUDAGraphCapture(platform::CUDAPlace place, + cudaStreamCaptureMode mode); +std::unique_ptr EndCUDAGraphCapture(); +#endif + +inline bool IsCUDAGraphCapturing() { +#ifdef PADDLE_WITH_CUDA + return CUDAGraph::IsCapturing(); +#else + return false; +#endif +} + +inline platform::CUDAPlace CUDAGraphCapturingPlace() { +#ifdef PADDLE_WITH_CUDA + return CUDAGraph::CapturingPlace(); +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + +// Add reset callback if CUDA Graph is capturing. +// Otherwise, invoke callback directly. +template +inline void AddResetCallbackIfCapturingCUDAGraph(Callback &&callback) { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(IsCUDAGraphCapturing())) { + return CUDAGraph::AddResetCallbackDuringCapturing( + std::forward(callback)); + } +#endif + callback(); +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusparse.cc b/paddle/fluid/platform/dynload/cusparse.cc index 2b41da541d9ae..2a1fe322dabcf 100644 --- a/paddle/fluid/platform/dynload/cusparse.cc +++ b/paddle/fluid/platform/dynload/cusparse.cc @@ -26,6 +26,10 @@ void *cusparse_dso_handle; #ifdef CUSPARSE_ROUTINE_EACH CUSPARSE_ROUTINE_EACH(DEFINE_WRAP); #endif + +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R2 +CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cusparse.h b/paddle/fluid/platform/dynload/cusparse.h index 98841949676e4..e5be003fadf06 100644 --- a/paddle/fluid/platform/dynload/cusparse.h +++ b/paddle/fluid/platform/dynload/cusparse.h @@ -41,8 +41,9 @@ extern void *cusparse_dso_handle; }; \ extern DynLoad__##__name __name -#ifndef _WIN32 -#if CUDA_VERSION >= 11020 +#if !defined(PADDLE_WITH_ARM) && !defined(_WIN32) +// APIs available after CUDA 11.0 +#if CUDA_VERSION >= 11000 #define CUSPARSE_ROUTINE_EACH(__macro) \ __macro(cusparseCreate); \ __macro(cusparseCreateCsr); \ @@ -51,12 +52,19 @@ extern void *cusparse_dso_handle; __macro(cusparseSpMM); \ __macro(cusparseDestroySpMat); \ __macro(cusparseDestroyDnMat); \ - __macro(cusparseDestroy); \ - __macro(cusparseSDDMM_bufferSize); \ - __macro(cusparseSDDMM_preprocess); \ - __macro(cusparseSDDMM); + __macro(cusparseDestroy); CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP); + +// APIs available after CUDA 11.2 +#if CUDA_VERSION >= 11020 +#define CUSPARSE_ROUTINE_EACH_R2(__macro) \ + __macro(cusparseSDDMM_bufferSize); \ + __macro(cusparseSDDMM_preprocess); \ + __macro(cusparseSDDMM); + +CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP) +#endif #endif #endif diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c420a5a64be06..7427060add8b1 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -31,6 +31,7 @@ limitations under the License. 
*/ #ifdef PADDLE_WITH_CUDA #include #include +#include #include #include #include @@ -714,6 +715,7 @@ DEFINE_EXTERNAL_API_TYPE(curandStatus_t, CURAND_STATUS_SUCCESS, CURAND); DEFINE_EXTERNAL_API_TYPE(cudnnStatus_t, CUDNN_STATUS_SUCCESS, CUDNN); DEFINE_EXTERNAL_API_TYPE(cublasStatus_t, CUBLAS_STATUS_SUCCESS, CUBLAS); DEFINE_EXTERNAL_API_TYPE(cusolverStatus_t, CUSOLVER_STATUS_SUCCESS, CUSOLVER); +DEFINE_EXTERNAL_API_TYPE(cufftResult_t, CUFFT_SUCCESS, CUFFT); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) DEFINE_EXTERNAL_API_TYPE(ncclResult_t, ncclSuccess, NCCL); @@ -751,6 +753,8 @@ inline const char* GetErrorMsgUrl(T status) { return "https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/api/" "types.html#ncclresult-t"; break; + case platform::proto::ApiType::CUFFT: + return "https://docs.nvidia.com/cuda/cufft/index.html#cufftresult"; default: return "Unknown type of External API, can't get error message URL!"; break; @@ -839,6 +843,7 @@ template std::string GetExternalErrorMsg(curandStatus_t); template std::string GetExternalErrorMsg(cudnnStatus_t); template std::string GetExternalErrorMsg(cublasStatus_t); template std::string GetExternalErrorMsg(cusolverStatus_t); +template std::string GetExternalErrorMsg(cufftResult_t); #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) template std::string GetExternalErrorMsg(ncclResult_t); #endif @@ -899,6 +904,15 @@ inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { return sout.str(); } +/*************** CUFFT ERROR ***************/ +inline bool is_error(cufftResult_t stat) { return stat != CUFFT_SUCCESS; } + +inline std::string build_nvidia_error_msg(cufftResult_t stat) { + std::ostringstream sout; + sout << "CUFFT error(" << stat << "). " << GetExternalErrorMsg(stat); + return sout.str(); +} + /**************** NCCL ERROR ****************/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { diff --git a/paddle/fluid/platform/enforce_test.cc b/paddle/fluid/platform/enforce_test.cc index 95a852ad6e92a..c6d5f171ddce4 100644 --- a/paddle/fluid/platform/enforce_test.cc +++ b/paddle/fluid/platform/enforce_test.cc @@ -9,10 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
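With cufftResult_t registered above, cuFFT calls can be checked through the same macro as the other CUDA libraries, which is what spectral_op.cu's CUFFT_CHECK now relies on. A small illustration with arbitrary plan parameters:

    cufftHandle plan;
    PADDLE_ENFORCE_CUDA_SUCCESS(cufftPlan1d(&plan, /*nx=*/1024, CUFFT_C2C, /*batch=*/1));
    PADDLE_ENFORCE_CUDA_SUCCESS(cufftDestroy(plan));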
*/ +#include "paddle/fluid/platform/enforce.h" + #include #include "gtest/gtest.h" -#include "paddle/fluid/platform/enforce.h" TEST(ENFORCE, OK) { PADDLE_ENFORCE(true, paddle::platform::errors::Unavailable( @@ -418,6 +419,25 @@ TEST(enforce, cuda_success) { "negative vector size, for example).To correct: ensure that all the " "parameters being passed have valid values")); + EXPECT_TRUE(CheckCudaStatusSuccess(CUFFT_SUCCESS)); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_PLAN, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_ALLOC_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_TYPE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_VALUE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INTERNAL_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_EXEC_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_SETUP_FAILED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_SIZE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_UNALIGNED_DATA, "CUFFT error")); + EXPECT_TRUE( + CheckCudaStatusFailure(CUFFT_INCOMPLETE_PARAMETER_LIST, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_INVALID_DEVICE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_PARSE_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NO_WORKSPACE, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_IMPLEMENTED, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_LICENSE_ERROR, "CUFFT error")); + EXPECT_TRUE(CheckCudaStatusFailure(CUFFT_NOT_SUPPORTED, "CUFFT error")); + #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) EXPECT_TRUE(CheckCudaStatusSuccess(ncclSuccess)); EXPECT_TRUE(CheckCudaStatusFailure(ncclUnhandledCudaError, "NCCL error")); diff --git a/paddle/fluid/platform/external_error.proto b/paddle/fluid/platform/external_error.proto index 2094de7e10f69..cbbf803492e64 100644 --- a/paddle/fluid/platform/external_error.proto +++ b/paddle/fluid/platform/external_error.proto @@ -24,6 +24,7 @@ enum ApiType { CUBLAS = 3; CUSOLVER = 4; NCCL = 5; + CUFFT = 6; } message MessageDesc { diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index b97c3106439be..dd65d743fad31 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -121,6 +121,13 @@ PADDLE_DEFINE_EXPORTED_string( "If proveided, it will be passed to aclInit()."); PADDLE_DEFINE_EXPORTED_int32(min_loss_scaling, 1, "set minmum loss scaling value!"); +PADDLE_DEFINE_EXPORTED_string( + npu_precision_mode, "", + "NPU operator precision mode, options are 'force_fp32', 'force_fp16', " + "'allow_fp32_to_fp16', 'must_keep_origin_dtype' and " + "'allow_mix_precision'. If you want to use the default mode (" + "allow_fp32_to_fp16), set this to empty string. For more details, " + "please refer to the documents"); #endif #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -673,3 +680,40 @@ PADDLE_DEFINE_EXPORTED_int32(get_host_by_name_time, 120, PADDLE_DEFINE_EXPORTED_bool( apply_pass_to_program, false, "It controls whether to apply IR pass to program when using Fleet APIs"); + +/** + * Distributed related FLAG + * Name: FLAGS_allreduce_record_one_event + * Since Version: 2.2.0 + * Value Range: bool, default=false + * Example: FLAGS_allreduce_record_one_event=true makes the allreduce + * operations would only wait one event instead of multiple events. 
+ * Note: Make the allreduce operations would only wait one event instead of + * multiple events. Currently, only fuse allreduce supports this. + * Otherwise, the precision may be wrong. + */ +PADDLE_DEFINE_EXPORTED_bool(allreduce_record_one_event, false, + "It controls whether the allreduce operations " + "would only wait one event instead of multiple " + "events. Currently, only fuse allreduce supports " + "this. Otherwise, the precision may be wrong."); + +/* + * CINN related FLAG + * Name: FLAGS_use_cinn + * Since Version: 2.3 + * Value Range: bool, default=false + * Example: FLAGS_use_cinn=true would run PaddlePaddle using CINN + */ +PADDLE_DEFINE_EXPORTED_bool( + use_cinn, false, "It controls whether to run PaddlePaddle using CINN"); + +DEFINE_int32(record_pool_max_size, 2000000, + "SlotRecordDataset slot record pool max size"); +DEFINE_int32(slotpool_thread_num, 1, "SlotRecordDataset slot pool thread num"); +DEFINE_bool(enable_slotpool_wait_release, false, + "enable slotrecord obejct wait release, default false"); +DEFINE_bool(enable_slotrecord_reset_shrink, false, + "enable slotrecord obejct reset shrink memory, default false"); +DEFINE_bool(enable_ins_parser_file, false, + "enable parser ins file , default false"); diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 76edb3910ccce..c624ba94b74a3 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -14,12 +14,15 @@ limitations under the License. */ #include "paddle/fluid/platform/gpu_info.h" #include +#include +#include #include "gflags/gflags.h" #include "paddle/fluid/platform/cuda_device_guard.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/platform/dynload/miopen.h" #else +#include "paddle/fluid/platform/cuda_graph.h" #include "paddle/fluid/platform/dynload/cudnn.h" #endif #include "paddle/fluid/memory/malloc.h" @@ -39,6 +42,10 @@ DECLARE_uint64(gpu_memory_limit_mb); constexpr static float fraction_reserve_gpu_memory = 0.05f; +static std::once_flag g_device_props_size_init_flag; +static std::vector> g_device_props_init_flags; +static std::vector g_device_props; + USE_GPU_MEM_STAT; namespace paddle { namespace platform { @@ -297,6 +304,44 @@ std::vector GetSelectedDevices() { return devices; } +const gpuDeviceProp &GetDeviceProperties(int id) { + std::call_once(g_device_props_size_init_flag, [&] { + int gpu_num = 0; + gpu_num = platform::GetCUDADeviceCount(); + g_device_props_init_flags.resize(gpu_num); + g_device_props.resize(gpu_num); + for (int i = 0; i < gpu_num; ++i) { + g_device_props_init_flags[i] = std::make_unique(); + } + }); + + if (id == -1) { + id = platform::GetCurrentDeviceId(); + } + + if (id < 0 || id >= static_cast(g_device_props.size())) { + PADDLE_THROW(platform::errors::OutOfRange( + "The device id %d is out of range [0, %d), where %d is the number of " + "devices on this machine. Because the device id should be greater than " + "or equal to zero and smaller than the number of gpus. 
Please input " + "appropriate device again!", + id, static_cast(g_device_props.size()), + static_cast(g_device_props.size()))); + } + + std::call_once(*(g_device_props_init_flags[id]), [&] { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaGetDeviceProperties(&g_device_props[id], id)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + hipGetDeviceProperties(&g_device_props[id], id)); +#endif + }); + + return g_device_props[id]; +} + void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), @@ -513,6 +558,7 @@ class RecordedCudaMallocHelper { #ifdef PADDLE_WITH_HIP auto result = hipMalloc(ptr, size); #else + CUDAGraphCaptureModeGuard capture_mode_guard; auto result = cudaMalloc(ptr, size); #endif if (result == gpuSuccess) { diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index ef7f93a61dbfb..401873dcd77da 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -67,6 +67,9 @@ dim3 GetGpuMaxGridDimSize(int); //! Get a list of device ids from environment variable or use all. std::vector GetSelectedDevices(); +//! Get the properties of the ith GPU device. +const gpuDeviceProp &GetDeviceProperties(int id); + //! Set the GPU device id for next execution. void SetDeviceId(int device_id); diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index f14f92cb51fdb..37fa58e423db7 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -531,7 +531,13 @@ inline bool HasOpBFLOAT16DataType(const paddle::framework::OpDesc* op) { inline bool HasOpFLOAT32DataType(const paddle::framework::OpDesc* op) { return op->GetAttrIfExists("mkldnn_data_type") == "float32"; } + enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP }; +template +bool constexpr is_int8() { + return std::is_same::value || std::is_same::value; +} + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 1aa8c0cdb57f9..084b47bb3c7a3 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -527,7 +527,8 @@ class MKLDNNHandlerT { const mkldnn::memory::desc& user_md, const mkldnn::memory::desc& target_md, void* ptr, const std::string& suffix, bool is_persistent = false, - std::function(const F*)> custom_reorder_func = {}) { + std::function(const F*)> custom_reorder_func = {}, + const std::vector& scale_data = {1.0f}, int mask = 0) { const auto target_key = key_ + suffix + "_target"; const auto key_reorder_p = key_ + suffix + "reorder_p"; const auto user_key = key_ + suffix + "_user"; @@ -546,8 +547,17 @@ class MKLDNNHandlerT { std::make_shared(user_md, engine_, ptr); if (user_md != target_md) { target_memory_p = std::make_shared(target_md, engine_); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + dnnl::reorder::primitive_desc reorder_pdesc; + if (is_int8()) { + dnnl::primitive_attr attr; + attr.set_output_scales(mask, scale_data); + reorder_pdesc = dnnl::reorder::primitive_desc(*user_memory_p, + *target_memory_p, attr); + } else { + reorder_pdesc = + dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p); + } + auto reorder_p = std::make_shared(reorder_pdesc); dev_ctx_.SetBlob(key_reorder_p, reorder_p); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -597,201 +607,6 @@ class MKLDNNHandlerT { std::shared_ptr bwd_w_pd_; }; -// TODO(grygielski) 
this class will be deleted later. -class MKLDNNHandler { - public: - MKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : dev_ctx_(dev_ctx), - engine_(engine), - key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)) { - platform::MKLDNNDeviceContext::tls().log_lib_version(); - } - - std::shared_ptr AcquireSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_src_mem_p"); - } - - std::shared_ptr AcquireDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_dst_mem_p"); - } - - std::shared_ptr AcquireDiffSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_src_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_diff_dst_mem_p"); - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, void* ptr, const std::string& suffix) { - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemoryFromPrimitive( - mkldnn::memory::desc md, const std::string& suffix) { - const auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - mem_p = std::make_shared(md, engine_); - dev_ctx_.SetBlob(local_key, mem_p); - } - return mem_p; - } - - // This incarnation of AcquireMemory can call user function eg. custom reorder - // or preprocessing routine if needed - std::shared_ptr AcquireMemory( - const mkldnn::memory::desc& md, void* ptr, const std::string& suffix, - user_function custom_func = {}) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - // Call custom reorder/preprocessing func if available - if (custom_func) { - auto reordered_data = custom_func(reinterpret_cast(ptr)); - dev_ctx_.SetBlob(local_key + "-custom_reorder", reordered_data); - ptr = reinterpret_cast(reordered_data.get()); - } - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::vector& dims, const mkldnn::memory::data_type dtype, - const MKLDNNMemoryFormat& fmt, void* ptr, const std::string& suffix) { - /*Generate key*/ - auto local_key = key_ + suffix; - auto mem_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - if (mem_p == nullptr) { - auto md = mkldnn::memory::desc(dims, dtype, fmt); - - mem_p = std::make_shared(md, engine_, ptr); - dev_ctx_.SetBlob(local_key, mem_p); - } else { - mem_p->set_data_handle(ptr); - } - return mem_p; - } - - std::shared_ptr AcquireMemory( - const std::shared_ptr& user_memory_p, - const std::shared_ptr& target_memory_p, - const std::string& suffix, - std::vector& pipeline) { // NOLINT - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto stored_reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - - if (stored_reorder_p) { - pipeline.push_back(*stored_reorder_p); - } else { - auto reorder_p = - std::make_shared(*user_memory_p, 
*target_memory_p); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - - return target_memory_p; - } - - std::shared_ptr AcquireMemory( - mkldnn::memory::desc& md, // NOLINT - mkldnn::memory::desc& user_md, // NOLINT - const std::shared_ptr user_memory_p, - const std::string& suffix, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - // create reorder primitive if the input format is not the preferred one - auto local_key = key_ + suffix; - auto key_reorder_p = key_ + suffix + "reorder_p"; - - auto target_memory_p = - std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - if (target_memory_p == nullptr) { - target_memory_p = user_memory_p; - if (md != user_md) { - target_memory_p = std::make_shared(md, engine_); - std::shared_ptr reorder_pd; - if (is_INT8) { - mkldnn::primitive_attr - attri; // attribute for int8 weights and bias data reorder. - attri.set_output_scales(mask, scale_data); - - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p, attri)); - } else { - reorder_pd = std::shared_ptr( - new mkldnn::reorder::primitive_desc(*user_memory_p, - *target_memory_p)); - } - auto reorder_p = - std::shared_ptr(new mkldnn::reorder(*reorder_pd)); - dev_ctx_.SetBlob(key_reorder_p, reorder_p); - - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - dev_ctx_.SetBlob(local_key, target_memory_p); - } else if (!is_persistent) { - // Make reorder if needed - auto reorder_p = std::static_pointer_cast( - dev_ctx_.GetBlob(key_reorder_p)); - if (reorder_p != nullptr) { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - reorder_p->execute(astream, {{MKLDNN_ARG_FROM, *user_memory_p}, - {MKLDNN_ARG_TO, *target_memory_p}}); - astream.wait(); - } - } - return target_memory_p; - } - - protected: - const MKLDNNDeviceContext& dev_ctx_; - mkldnn::engine engine_; - std::string key_; -}; - template class BinaryMKLDNNHandler : public platform::MKLDNNHandlerNoCachingT { @@ -1143,362 +958,6 @@ class ReorderMKLDNNHandler { mkldnn::engine engine_; }; -template -struct convolutional_algorithm; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = mkldnn::algorithm::convolution_direct; -}; - -template <> -struct convolutional_algorithm { - static constexpr mkldnn::algorithm T = - mkldnn::algorithm::deconvolution_direct; -}; - -template -class ConvMKLDNNTemplateHandler : public MKLDNNHandler { - public: - ConvMKLDNNTemplateHandler(const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - - // TODO(jczaja): remove after conv int8 is adapted - ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) { - conv_pd_ = conv_pd; - } - - 
ConvMKLDNNTemplateHandler( - std::shared_ptr conv_pd, - std::shared_ptr - conv_bwd_data_pd, - std::shared_ptr - conv_bwd_weights_pd, - const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, - const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key), - conv_pd_(conv_pd), - conv_bwd_weights_pd_(conv_bwd_weights_pd), - conv_bwd_data_pd_(conv_bwd_data_pd) { - // If we are in Grad operatgor then update a key with BWD suffix to - // distinguish from FWD memory primitives - key_ += "-BWD"; - } - - size_t GetDstMemorySize() const { return conv_pd_->dst_desc().get_size(); } - - MKLDNNMemoryFormat GetDstFormat() const { - return paddle::platform::GetMKLDNNFormat(conv_pd_->dst_desc()); - } - - size_t GetDiffWeightsMemorySize() const { - return conv_bwd_weights_pd_->diff_weights_desc().get_size(); - } - - size_t GetDiffSourceMemorySize() const { - return conv_bwd_data_pd_->diff_src_desc().get_size(); - } - - std::shared_ptr AcquireSrcMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_bwd_weights_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, - "@weights-src_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffDstMemoryFromWeightsPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_weights_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@weights-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), ptr, "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( - void) { - return this->AcquireMemoryFromPrimitive( - conv_bwd_weights_pd_->diff_weights_desc(), "@diff_weights_mem_p"); - } - - std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto diff_dst_pd = conv_bwd_data_pd_->diff_dst_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(diff_dst_pd, user_pd, user_memory_p, - "@data-diff_dst_mem_p", pipeline); - } - - std::shared_ptr AcquireWeightsMemoryFromDataPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline) { // NOLINT - auto weights_pd = conv_bwd_data_pd_->weights_desc(); - auto user_pd = user_weights_memory_p->get_desc(); - return this->AcquireMemory(weights_pd, user_pd, user_weights_memory_p, - "@data-weights_mem_p", pipeline); - } - - std::shared_ptr AcquireResidualDataMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_residual_data_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromResidualDataMemory( - const std::shared_ptr& user_residual_memory_p, - void* dst_ptr, - std::vector& pipeline) { // NOLINT - return this->AcquireMemory(user_residual_memory_p, - this->AcquireDstMemoryFromPrimitive(dst_ptr), - "@residual_data_mem_p", pipeline); - } - - std::shared_ptr AcquireDiffSrcMemoryFromDataPrimitive( - void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_bwd_data_pd_->diff_src_desc(), - ptr, "@diff_src_mem_p"); - } - - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(conv_pd_->dst_desc(), ptr, - "@dst_mem_p"); - } - - std::shared_ptr 
AcquireSrcMemoryFromPrimitive( - const std::shared_ptr user_memory_p, - std::vector& pipeline) { // NOLINT - auto src_pd = conv_pd_->src_desc(); - auto user_pd = user_memory_p->get_desc(); - return this->AcquireMemory(src_pd, user_pd, user_memory_p, "@src_mem_p", - pipeline); - } - - std::shared_ptr AcquireWeightsMemory( - const mkldnn::memory::desc& md, void* ptr, - user_function custom_func = {}) { - return this->AcquireMemory(md, ptr, "@user_weights_mem_p", custom_func); - } - - std::shared_ptr AcquireBiasMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_bias_mem_p"); - } - - std::shared_ptr AcquireWeightsMemoryFromPrimitive( - const std::shared_ptr user_weights_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, int mask = 0) { - auto user_weights_pd = user_weights_memory_p->get_desc(); - auto weights_pd = conv_pd_->weights_desc(); - return this->AcquireMemory( - weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent, is_INT8, scale_data, mask); - } - - std::shared_ptr AcquireBiasMemoryFromPrimitive( - const std::shared_ptr user_bias_memory_p, - std::vector& pipeline, // NOLINT - bool is_persistent = false, bool is_INT8 = false, - std::vector scale_data = {1.0f}, - int mask = 0) { // NOLINT - auto user_bias_pd = user_bias_memory_p->get_desc(); - auto bias_pd = conv_pd_->bias_desc(); - return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline, is_persistent, is_INT8, - scale_data, mask); - } - - mkldnn::primitive_attr CreatePostOps( - std::string fuse_activation, float fuse_alpha, float fuse_beta, - bool fuse_residual_conn, const std::vector output_shift_scale = {}, - float sum_scale = 1.0f) const { - mkldnn::primitive_attr conv_attr; - mkldnn::post_ops post_operations; - if (output_shift_scale.size() > 0) { - int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; - conv_attr.set_output_scales(mask, output_shift_scale); - } - // Fusion with Elementwise layer relies on adding a sum post-operation with - // the scale parameter. It is assumed that when fuse_residual_connection is - // true, the output tensor contains the data coming from residual - // connection. The result of this post_op is: - // Output = scale * Output + Conv_Out. - if (fuse_residual_conn) { - post_operations.append_sum(sum_scale); - } - // Fusion with ReLU layer is executed through the PostOps feature. Create a - // PostOps object and configure it to execute an eltwise relu operation. 
- if (fuse_activation == "relu" || fuse_activation == "leaky_relu") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "relu6") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, - mkldnn::algorithm::eltwise_bounded_relu, - fuse_alpha, fuse_beta); - } else if (fuse_activation == "swish") { - constexpr float scale = 1.0f; - post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_swish, - fuse_alpha, fuse_beta); - } - conv_attr.set_post_ops(post_operations); - return conv_attr; - } - - std::shared_ptr - AcquireConvolutionPrimitiveDescriptor( - const mkldnn::memory::desc& src, const mkldnn::memory::desc& weights, - paddle::optional bias, - const mkldnn::memory::desc& dst, const std::vector& strides, - const std::vector& dilations, - const std::vector& paddings, const mkldnn::engine& engine, - const std::string& fuse_activation, float fuse_alpha, float fuse_beta, - const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind, - const std::vector output_shift_scale = {}, - const float sum_scale = 1.0f) { - // Conv PD has to be passed to Grad op that - // may be exxecuted by diffrent thread, hence - // for that one we use key that does not contain TID - const std::string key_conv_pd = key_ + "@conv_pd"; - - conv_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_conv_pd)); - - if (conv_pd_ == nullptr) { - mkldnn::memory::dims stride_dims = strides; - mkldnn::memory::dims dilations_dims = dilations; - auto mkldnn_paddings = ToMkldnnPadding(paddings); - - auto conv_desc = - bias ? typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, *bias, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]) - : typename forward_t::desc( - fwd_prop_kind, convolutional_algorithm::T, src, - weights, dst, stride_dims, dilations_dims, - mkldnn_paddings[0], mkldnn_paddings[1]); - - mkldnn::primitive_attr conv_attr = - CreatePostOps(fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, output_shift_scale, sum_scale); - - conv_pd_.reset( - new typename forward_t::primitive_desc(conv_desc, conv_attr, engine)); - // Save conv_pd/src_memory/weights_memory for backward pass - dev_ctx_.SetBlob(key_conv_pd, conv_pd_); - } - - return conv_pd_; - } - - std::shared_ptr AcquireConvolution() { - auto prim_key = key_ + "@conv_p"; - auto conv_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_p == nullptr) { - conv_p = std::make_shared(*conv_pd_); - - dev_ctx_.SetBlob(prim_key, conv_p); - } - return conv_p; - } - - std::shared_ptr AcquireConvolutionBackwardWeights() { - auto prim_key = key_ + "@conv_bwd_weights_p"; - auto conv_bwd_weights_p = std::static_pointer_cast( - dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_weights_p == nullptr) { - // create backward conv primitive for weights - conv_bwd_weights_p = - std::make_shared(*conv_bwd_weights_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_weights_p); - } - return conv_bwd_weights_p; - } - - std::shared_ptr AcquireConvolutionBackwardData() { - auto prim_key = key_ + "@conv_bwd_data_p"; - auto conv_bwd_data_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (conv_bwd_data_p == nullptr) { - conv_bwd_data_p = std::make_shared(*conv_bwd_data_pd_); - dev_ctx_.SetBlob(prim_key, conv_bwd_data_p); - } - return conv_bwd_data_p; - } - - private: - std::shared_ptr conv_pd_; - std::shared_ptr - conv_bwd_weights_pd_; - std::shared_ptr conv_bwd_data_pd_; 
-}; - -using ConvMKLDNNHandler = - ConvMKLDNNTemplateHandler; - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); - return dst_memory_p; -} - -template -static std::shared_ptr SetDstMemory( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const framework::Tensor* residual_param, - const mkldnn::memory::desc& user_residual_md, - const std::shared_ptr& handler, - std::vector* pipeline) { - const T* residual_param_data = residual_param->data(); - PADDLE_ENFORCE_NOT_NULL( - residual_param_data, - platform::errors::PreconditionNotMet("Residual parameter is required for " - "the DNNL conv+elementwise_add " - "fusion, but now it is missing.")); - std::shared_ptr user_residual_memory_p = - handler->AcquireResidualDataMemory(user_residual_md, - to_void_cast(residual_param_data)); - T* output_data = output->mutable_data(ctx.GetPlace()); - std::shared_ptr dst_memory_p = - handler->AcquireDstMemoryFromResidualDataMemory( - user_residual_memory_p, to_void_cast(output_data), *pipeline); - return dst_memory_p; -} - -template -static void SetDstMemoryHandler( - const framework::ExecutionContext& ctx, framework::Tensor* output, - const std::shared_ptr& handler, - std::shared_ptr dst_memory_p) { - T* output_data = - output->mutable_data(ctx.GetPlace(), handler->GetDstMemorySize()); - dst_memory_p->set_data_handle(to_void_cast(output_data)); -} - template static void SetDstMemoryQuantized( const framework::ExecutionContext& ctx, framework::Tensor* output, @@ -1524,5 +983,6 @@ static void SetDstMemoryQuantized( dst_memory.reset( new mkldnn::memory(*dst_md, engine, to_void_cast(output_data))); } + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/type_defs.h b/paddle/fluid/platform/type_defs.h index 31784a0426580..88a2d16472fa7 100644 --- a/paddle/fluid/platform/type_defs.h +++ b/paddle/fluid/platform/type_defs.h @@ -27,11 +27,14 @@ namespace paddle { using gpuStream_t = hipStream_t; using gpuError_t = hipError_t; using gpuEvent_t = hipEvent_t; +using gpuDeviceProp = hipDeviceProp_t; #else #define gpuSuccess cudaSuccess using gpuStream_t = cudaStream_t; using gpuError_t = cudaError_t; using gpuEvent_t = cudaEvent_t; +using gpuDeviceProp = cudaDeviceProp; #endif +using CUDAGraphID = unsigned long long; // NOLINT } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 22778013f2390..875e6af9652a2 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -7,7 +7,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapp feed_fetch_method pass generate_pass pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper custom_operator - cost_model) + cost_model cuda_graph_with_memory_pool) if (WITH_PSCORE) set(PYBIND_DEPS ${PYBIND_DEPS} ps_service) diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 41cf0189d3d9d..7a32d8729fc6c 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -309,8 
+309,6 @@ void BindDataset(py::module *m) { &framework::Dataset::SetFleetSendSleepSeconds, py::call_guard()) .def("enable_pv_merge", &framework::Dataset::EnablePvMerge, - py::call_guard()) - .def("set_heter_ps", &framework::Dataset::SetHeterPs, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/fluid/pybind/fleet_wrapper_py.cc b/paddle/fluid/pybind/fleet_wrapper_py.cc index 873476629cb78..d8142f717baed 100644 --- a/paddle/fluid/pybind/fleet_wrapper_py.cc +++ b/paddle/fluid/pybind/fleet_wrapper_py.cc @@ -76,6 +76,8 @@ void BindFleetWrapper(py::module* m) { .def("shrink_sparse_table", &framework::FleetWrapper::ShrinkSparseTable) .def("shrink_dense_table", &framework::FleetWrapper::ShrinkDenseTable) .def("print_table_stat", &framework::FleetWrapper::PrintTableStat) + .def("set_file_num_one_shard", + &framework::FleetWrapper::SetFileNumOneShard) .def("client_flush", &framework::FleetWrapper::ClientFlush) .def("load_from_paddle_model", &framework::FleetWrapper::LoadFromPaddleModel) diff --git a/paddle/fluid/pybind/ir.cc b/paddle/fluid/pybind/ir.cc index e27e3674eeeb5..050bfc967daa1 100644 --- a/paddle/fluid/pybind/ir.cc +++ b/paddle/fluid/pybind/ir.cc @@ -125,7 +125,15 @@ void BindGraph(py::module *m) { return_value_policy::reference) .def("resolve_hazard", &Graph::ResolveHazard) .def("origin_program_desc", &Graph::OriginProgram, - return_value_policy::reference); + return_value_policy::reference) + .def("sub_graph_size", &Graph::SubGraphsSize) + .def("get_sub_graph", [](Graph &self, int i) { + /* Here we use a lambda function as an empty deleter to avoid a double + free of the smart pointer. Otherwise, this shared pointer would be freed in + both the Python and C++ scopes, which would lead to a core dump. */ + return std::shared_ptr(self.GetSubGraph(i), [](Graph *) {}); + }); } void BindNode(py::module *m) { diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index f9d11e8154f43..01d101909b549 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -54,6 +54,7 @@ std::map> op_ins_map = { {"gather", {"X", "Index", "Axis"}}, {"roi_pool", {"X", "ROIs", "RoisNum"}}, {"roi_align", {"X", "ROIs", "RoisNum"}}, + {"psroi_pool", {"X", "ROIs", "RoisNum"}}, {"collect_fpn_proposals", {"MultiLevelRois", "MultiLevelScores", "MultiLevelRoIsNum"}}, {"distribute_fpn_proposals", {"FpnRois", "RoisNum"}}, @@ -71,6 +72,9 @@ std::map> op_ins_map = { {"adam", {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow", "MasterParam"}}, + {"adamw", + {"Param", "Grad", "LearningRate", "Moment1", "Moment2", "Beta1Pow", + "Beta2Pow", "MasterParam"}}, }; // NOTE(zhiqiu): Like op_ins_map.
@@ -110,6 +114,9 @@ std::map> op_outs_map = { {"adam", {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, + {"adamw", + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are @@ -129,7 +136,8 @@ std::map> op_passing_outs_map = { {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", "MasterParamOut"}}, {"adamw", - {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut"}}, + {"ParamOut", "Moment1Out", "Moment2Out", "Beta1PowOut", "Beta2PowOut", + "MasterParamOut"}}, {"average_accumulates", {"out_sum_1", "out_sum_2", "out_sum_3", "out_num_accumulates", "out_old_num_accumulates", "out_num_updates"}}, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c00f529f61793..80350abb4fe21 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -125,6 +125,8 @@ limitations under the License. */ #include "paddle/fluid/platform/xpu/xpu_info.h" #endif +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" + #ifdef PADDLE_WITH_CRYPTO #include "paddle/fluid/pybind/crypto.h" #endif @@ -520,6 +522,19 @@ PYBIND11_MODULE(core_noavx, m) { m.def("nccl_version", &GetNCCLVersion); #endif + m.def("is_cuda_graph_capturing", &platform::IsCUDAGraphCapturing); +#ifdef PADDLE_WITH_CUDA + py::class_(m, "CUDAGraph") + .def_static("begin_capture", + [](platform::CUDAPlace place, int mode) { + platform::BeginCUDAGraphCapture( + place, static_cast(mode)); + }) + .def_static("end_capture", &platform::EndCUDAGraphCapture) + .def("replay", &platform::CUDAGraph::Replay) + .def("reset", &platform::CUDAGraph::Reset); +#endif + m.def("wait_device", [](const platform::Place &place) { platform::DeviceContextPool::Instance().Get(place)->Wait(); }); @@ -537,11 +552,11 @@ PYBIND11_MODULE(core_noavx, m) { DLTensor dl = dmt->dl_tensor; framework::Tensor tensor; - if (dl.ctx.device_type == kDLCPU) { + if (dl.device.device_type == kDLCPU) { paddle::framework::TensorFromDLPack(dl, &tensor); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (dl.ctx.device_type == kDLGPU) { + if (dl.device.device_type == kDLGPU) { paddle::framework::TensorFromDLPack(dl, &tensor); } #endif @@ -721,6 +736,17 @@ PYBIND11_MODULE(core_noavx, m) { paddle::framework::proto::VarType::Type type) { return reinterpret_cast(self.mutable_data(place, type)); }) + .def("_copy_from", + [](framework::Tensor &self, const framework::Tensor &other, + const platform::Place &place, int64_t batch_size) { + if (batch_size < 0) { + framework::TensorCopy(other, place, &self); + } else { + auto sliced = other.Slice(0, batch_size); + framework::TensorCopy(sliced, place, &self); + } + }, + py::arg("tensor"), py::arg("place"), py::arg("batch_size") = -1) .def("set", SetTensorFromPyArray, py::arg("array"), py::arg("place"), py::arg("zero_copy") = false) .def("set", SetTensorFromPyArray, @@ -776,8 +802,7 @@ PYBIND11_MODULE(core_noavx, m) { .def("_to_dlpack", [](framework::Tensor &self) { DLPackTensor dlpack_tensor(self, 1); - DLManagedTensor *dmt = - dlpack_tensor.ToCudfCompatibleDLManagedTensor(); + DLManagedTensor *dmt = dlpack_tensor.ToDLManagedTensor(); auto capsule = py::capsule( static_cast(dmt), "dltensor", [](PyObject *ptr) { if (ptr) { @@ -2285,7 +2310,39 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("op_support_gpu", OpSupportGPU); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) m.def("get_cuda_device_count", platform::GetCUDADeviceCount); - m.def("cuda_empty_cache", platform::EmptyCache); + m.def("cuda_empty_cache", [] { + for (int dev_id : platform::GetSelectedDevices()) { + auto *dev_ctx = platform::DeviceContextPool::Instance().GetByPlace( + platform::CUDAPlace(dev_id)); + dev_ctx->cudnn_workspace_handle().ResetWorkspace(); + } + platform::EmptyCache(); + }); + m.def("get_device_properties", + [](int id) -> const gpuDeviceProp & { + return platform::GetDeviceProperties(id); + }, + py::return_value_policy::copy); + + py::class_(m, "_gpuDeviceProperties") + .def_readonly("name", &gpuDeviceProp::name) + .def_readonly("major", &gpuDeviceProp::major) + .def_readonly("minor", &gpuDeviceProp::minor) + .def_readonly("is_multi_gpu_board", &gpuDeviceProp::isMultiGpuBoard) + .def_readonly("is_integrated", &gpuDeviceProp::integrated) + .def_readonly("multi_processor_count", + &gpuDeviceProp::multiProcessorCount) + .def_readonly("total_memory", &gpuDeviceProp::totalGlobalMem) + .def("__repr__", [](const gpuDeviceProp &gpu_device_prop) { + std::ostringstream stream; + stream << "_gpuDeviceProperties(name='" << gpu_device_prop.name + << "', major=" << gpu_device_prop.major + << ", minor=" << gpu_device_prop.minor << ", total_memory=" + << gpu_device_prop.totalGlobalMem / (1024 * 1024) + << "MB, multi_processor_count=" + << gpu_device_prop.multiProcessorCount << ")"; + return stream.str(); + }); #if !defined(PADDLE_WITH_HIP) && !defined(_WIN32) m.def("nvprof_init", platform::CudaProfilerInit); @@ -3172,6 +3229,13 @@ All parameter, weight, gradient are variables in Paddle. [](BuildStrategy &self, bool fix_op_run_order) { self.fix_op_run_order_ = fix_op_run_order; }) + .def_property("allow_cuda_graph_capture", + [](const BuildStrategy &self) { + return self.allow_cuda_graph_capture_; + }, + [](BuildStrategy &self, bool allow_cuda_graph_capture) { + self.allow_cuda_graph_capture_ = allow_cuda_graph_capture; + }) .def("_copy", [](const BuildStrategy &self) { auto new_bs = self; @@ -3229,6 +3293,18 @@ All parameter, weight, gradient are variables in Paddle. BOOST_GET(paddle::framework::FetchUnmergedList, ret))); } }) + .def("run_from_cinn", + [](ParallelExecutor &self, + const std::unordered_map &feed_tensors, + const std::vector &fetch_names) -> py::object { + paddle::framework::FetchResultType ret; + { + pybind11::gil_scoped_release release; + ret = self.RunFromCinn(feed_tensors, fetch_names); + } + return py::cast( + std::move(BOOST_GET(paddle::framework::FetchList, ret))); + }) .def("device_count", &ParallelExecutor::DeviceCount); BindFleetWrapper(&m); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 0283de66ba5af..e6320d5bd154d 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -89,7 +89,7 @@ if "%WITH_PYTHON%" == "ON" ( pip install -r %work_dir%\python\requirements.txt --user if !ERRORLEVEL! NEQ 0 ( echo pip install requirements.txt failed! 
- exit /b 7 + exit /b 5 ) ) @@ -138,6 +138,17 @@ if %day_now% NEQ %day_before% ( echo %day_now% > %cache_dir%\day.txt type %cache_dir%\day.txt rmdir %BUILD_DIR% /s/q + + : clear third party cache every once in a while + if %day_now% EQU 21 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 11 ( + rmdir %cache_dir%\third_party /s/q + ) + if %day_now% EQU 01 ( + rmdir %cache_dir%\third_party /s/q + ) goto :mkbuild ) @@ -309,7 +320,7 @@ if %GENERATOR% == "Ninja" ( pip install ninja if %errorlevel% NEQ 0 ( echo pip install ninja failed! - exit /b 7 + exit /b 5 ) ) @@ -333,24 +344,6 @@ rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 rem clcache.exe -M 21474836480 rem ------set third_party cache dir------ -: clear third party cache every once in a while -for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# -set day_now=%datetime:~6,2% -set day_before=-1 -set /p day_before=< %cache_dir%\day.txt -if %day_now% NEQ %day_before% ( - echo %day_now% > %cache_dir%\day.txt - type %cache_dir%\day.txt - if %day_now% EQU 21 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 11 ( - rmdir %cache_dir%\third_party /s/q - ) - if %day_now% EQU 01 ( - rmdir %cache_dir%\third_party /s/q - ) -) if "%WITH_TPCACHE%"=="OFF" ( set THIRD_PARTY_PATH=%work_dir:\=/%/%BUILD_DIR%/third_party @@ -395,15 +388,15 @@ if not exist %THIRD_PARTY_PATH% ( echo Getting third party: extracting ... tar -xf %md5%.tar.gz if !ERRORLEVEL! EQU 0 ( - echo Get third party from bos successfully + echo Get third party from bos successfully. ) else ( - echo Get third party failed, reason: extract failed, will build locally + echo Get third party failed, reason: extract failed, will build locally. ) del %md5%.tar.gz ) else ( - echo Get third party failed, reason: download failed, will build locally + echo Get third party failed, reason: download failed, will build locally. ) - if not exist %THIRD_PARTY_PATH% ( set UPLOAD_TP_FILE=ON ) + if not exist %THIRD_PARTY_PATH% set UPLOAD_TP_FILE=ON cd %work_dir%\%BUILD_DIR% ) else ( echo Found reusable third_party cache in %THIRD_PARTY_PATH%, will reuse it. @@ -540,18 +533,18 @@ if "%UPLOAD_TP_FILE%"=="ON" ( tar -zcf %md5%.tar.gz %md5% if !errorlevel! EQU 0 ( echo Uploading third_party: uploading ... - %PYTHON_ROOT%\python.exe %BCE_FILE% %md5%.tar.gz paddle-windows/third_party/%sub_dir% 1>nul + %PYTHON_ROOT%\python.exe !BCE_FILE! %md5%.tar.gz paddle-windows/third_party/%sub_dir% 1>nul if !errorlevel! EQU 0 ( - echo Upload third party to bos paddle-windows/third_party/%sub_dir% successfully + echo Upload third party %md5% to bos paddle-windows/third_party/%sub_dir% successfully. ) else ( - echo Failed upload third party to bos, reason: upload failed + echo Failed upload third party to bos, reason: upload failed. ) ) else ( - echo Failed upload third party to bos, reason: compress failed + echo Failed upload third party to bos, reason: compress failed. ) del %md5%.tar.gz ) else ( - echo Failed upload third party to bos, reason: install bce failed + echo Failed upload third party to bos, reason: install bce failed. ) cd %work_dir%\%BUILD_DIR% ) @@ -627,7 +620,7 @@ git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON pip install -r %work_dir%\python\unittest_py\requirements.txt --user if %ERRORLEVEL% NEQ 0 ( echo pip install unittest requirements.txt failed! 
- exit /b 7 + exit /b 5 ) for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 0c2580929081d..2cc4bd8d05fb8 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1076,7 +1076,6 @@ function get_quickly_disable_ut() { function card_test() { set -m - echo "$2 bengingggggg!!!!!" case_count $1 $2 ut_startTime_s=`date +%s` diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index e94805be5a147..d7f9a25ac7a88 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,6 +28,7 @@ int main(int argc, char** argv) { } std::vector envs; + std::vector undefok; #if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_PSLIB) std::string str_max_body_size; if (::GFLAGS_NAMESPACE::GetCommandLineOption("max_body_size", @@ -40,8 +41,18 @@ int main(int argc, char** argv) { const auto& flag_map = paddle::platform::GetExportedFlagInfoMap(); for (const auto& pair : flag_map) { const std::string& name = pair.second.name; + // NOTE(zhiqiu): some names may not linked in some tests, so add to + // `undefok`. + // One way to handle that is to check each flag item by item, and put it in + // `envs` or `undefok`; + // another way is to add all flags to `envs` and `undeok`, basically it is + // not a good design, + // but it can simplify the procedure of creating new flag and seems no side + // effects. + // see details: https://gflags.github.io/gflags/#special if (pair.second.is_writable) { // means public envs.push_back(name); + undefok.push_back(name); } } @@ -57,6 +68,18 @@ int main(int argc, char** argv) { VLOG(1) << "gtest env_string:" << env_string; } + char* undefok_str = nullptr; + if (undefok.size() > 0) { + std::string undefok_string = "--undefok="; + for (auto t : undefok) { + undefok_string += t + ","; + } + undefok_string = undefok_string.substr(0, undefok_string.length() - 1); + undefok_str = strdup(undefok_string.c_str()); + new_argv.push_back(undefok_str); + VLOG(1) << "gtest undefok_string:" << undefok_string; + } + int new_argc = static_cast(new_argv.size()); char** new_argv_address = new_argv.data(); ::GFLAGS_NAMESPACE::ParseCommandLineFlags( @@ -68,7 +91,7 @@ int main(int argc, char** argv) { #ifdef PADDLE_WITH_ASCEND_CL paddle::platform::AclInstance::Instance().Finalize(); #endif - if (env_str) free(env_str); + if (undefok_str) free(undefok_str); return ret; } diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index e4f0860e3be19..2051a4f6fcd50 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -64,7 +64,6 @@ import paddle.static # noqa: F401 import paddle.vision # noqa: F401 -from .tensor import fft from .tensor.random import bernoulli # noqa: F401 from .tensor.attribute import rank # noqa: F401 @@ -94,20 +93,12 @@ from .tensor.linalg import norm # noqa: F401 from .tensor.linalg import transpose # noqa: F401 from .tensor.linalg import dist # noqa: F401 -from .tensor.linalg import cond # noqa: F401 from .tensor.linalg import t # noqa: F401 from .tensor.linalg import cross # noqa: F401 from .tensor.linalg import cholesky # noqa: F401 from .tensor.linalg import bmm # noqa: F401 from .tensor.linalg import histogram # noqa: F401 from .tensor.linalg import mv # noqa: F401 -from .tensor.linalg import det # noqa: F401 -from .tensor.linalg import slogdet # noqa: F401 -from .tensor.linalg import multi_dot # noqa: F401 -from .tensor.linalg import matrix_power # 
noqa: F401 -from .tensor.linalg import svd # noqa: F401 -from .tensor.linalg import pinv # noqa: F401 -from .tensor.linalg import solve # noqa: F401 from .tensor.logic import equal # noqa: F401 from .tensor.logic import greater_equal # noqa: F401 from .tensor.logic import greater_than # noqa: F401 @@ -160,6 +151,7 @@ from .tensor.manipulation import roll # noqa: F401 from .tensor.manipulation import chunk # noqa: F401 from .tensor.manipulation import tolist # noqa: F401 +from .tensor.manipulation import tensordot # noqa: F401 from .tensor.math import abs # noqa: F401 from .tensor.math import acos # noqa: F401 from .tensor.math import asin # noqa: F401 @@ -302,6 +294,7 @@ from .hapi import flops # noqa: F401 from . import hub # noqa: F401 from . import linalg # noqa: F401 +from . import fft # noqa: F401 import paddle.text # noqa: F401 import paddle.vision # noqa: F401 @@ -478,6 +471,7 @@ 'bmm', 'chunk', 'tolist', + 'tensordot', 'greater_than', 'shard_index', 'argsort', @@ -506,7 +500,6 @@ 'stack', 'sqrt', 'cholesky', - 'matrix_power', 'randperm', 'linspace', 'reshape', diff --git a/python/paddle/autograd/__init__.py b/python/paddle/autograd/__init__.py index 89094357b3505..cffc18e95e5ab 100644 --- a/python/paddle/autograd/__init__.py +++ b/python/paddle/autograd/__init__.py @@ -18,5 +18,6 @@ from .py_layer import PyLayer, PyLayerContext # noqa: F401 from ..framework import set_grad_enabled # noqa: F401 from ..fluid.dygraph.base import no_grad_ as no_grad # noqa: F401 +from .functional import vjp, jvp, jacobian, hessian # noqa: F401 __all__ = ['backward', 'PyLayer', 'PyLayerContext'] diff --git a/python/paddle/autograd/functional.py b/python/paddle/autograd/functional.py new file mode 100644 index 0000000000000..688e04335ebb7 --- /dev/null +++ b/python/paddle/autograd/functional.py @@ -0,0 +1,515 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import paddle +from ..fluid import framework +from ..fluid.dygraph import grad +from ..nn.initializer import assign +from ..tensor import reshape, zeros_like, to_tensor +from .utils import _check_tensors, _stack_tensor_or_return_none, _replace_none_with_zero_tensor + + +def to_tensorlist(tl): + if not isinstance(tl, list): + if isinstance(tl, tuple): + tl = list(tl) + else: + tl = [tl] + for t in tl: + assert isinstance(t, paddle.Tensor) or t is None, ( + f'{t} is expected to be paddle.Tensor or None, but found {type(t)}.' + ) + return tl + + +@contextlib.contextmanager +def gradient_scope(*var_lists, create_graph=False, allow_unused=False): + def grad_fn(ys, xs, v, create_graph=create_graph): + assert len(ys) == len(v), ( + f'`v` is expected to be of the same size as the output. 
' + f'Here the output is {ys}, and `v` is {v}.') + if allow_unused: + ys = [ + to_tensor( + [0.0], stop_gradient=False) if y is None else y for y in ys + ] + return grad( + ys, xs, v, create_graph=create_graph, allow_unused=allow_unused) + + def return_fn(out): + if isinstance(out, paddle.Tensor): + if not create_graph: + out = out.detach() + return out + if isinstance(out, list): + return list(return_fn(x) for x in out) + elif isinstance(out, tuple): + return tuple(return_fn(x) for x in out) + else: + assert out is None + return out + + def process(vl): + out = [] + # If v is treated as constant in the outer scope, its gradient is guaranteed + # not to be taken beyond this scope. Within this scope, however, v's gradient + # may be computed. We only need to detach v in this case. + # Otherwise, v's gradient is valid, and is subject to update beyond this scope. + # In this case we must not confuse the gradient in the outer scope with the + # inner one's. Moreover, we need to make sure that the result from the inner + # scope can flow back to the outer scope. This can be satisfied by extending + # the original variable with a duplication operation v1 = v so that v still + # maintains the complete lineage. + for v in vl: + if v is None: + out.append(v) + continue + if create_graph and not v.stop_gradient: + v = assign(v) + else: + v = v.detach() + v.stop_gradient = False + out.append(v) + return out + + try: + var_lists = [process(vl) for vl in var_lists] + bundle = var_lists + [grad_fn, return_fn] + yield bundle + finally: + pass + + +@framework.dygraph_only +def vjp(func, inputs, v=None, create_graph=False, allow_unused=False): + r"""Computes the Vector-Jacobian product, a functional form of + reverse mode automatic differentiation. + + Args: + func(Callable): `func` takes as input a tensor or a list + of tensors and returns a tensor or a list of tensors. + inputs(list[Tensor]|Tensor): used as positional arguments + to evaluate `func`. `inputs` is accepted as one tensor + or a list of tensors. + v(list[Tensor]|Tensor, optional): the cotangent vector + invovled in the VJP computation. `v` matches the size + and shape of `func`'s output. Default value is None + and in this case is equivalent to all ones the same size + of `func`'s output. + create_graph(bool, optional): if `True`, gradients can + be evaluated on the results. If `False`, taking gradients + on the results is invalid. Default value is False. + allow_unused(bool, optional): In case that some Tensors of + `inputs` do not contribute to the computation of the output. + If `allow_unused` is False, an error will be raised, + Otherwise, the gradients of the said inputs are returned + None. Default value is False. + + Returns: + output(tuple): + func_out: the output of `func(inputs)` + vjp(list[Tensor]|Tensor): the pullback results of `v` on `func` + + Examples: + .. 
code-block:: python + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + output, inputs_grad = vjp(func, x) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[4., 4.], + # [4., 4.]])] + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + output, inputs_grad = vjp(func, x, v) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1.], + # [1., 0.]])] + + output, inputs_grad = vjp(func, x, v, create_graph=True) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[2., 1.], + # [1., 0.]])] + + y = paddle.ones(shape=[2, 2], dtype='float32') + def func_unused(x, y): + return paddle.matmul(x, x) + + output, inputs_grad = vjp(func, [x, y], v) + # ValueError: (InvalidArgument) The 1-th input does not appear in the backward graph. + # Please check the input variable or set allow_unused=True to get None result. + # [Hint: Expected allow_unused_ == true, but received allow_unused_:0 != true:1.] + + output, inputs_grad = vjp(func, [x, y], v, allow_unused=True) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1.], + # [1., 0.]]), None] + """ + xs, v = to_tensorlist(inputs), to_tensorlist(v) + + with gradient_scope( + xs, v, create_graph=create_graph, + allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: + outputs = func(*xs) + ys = to_tensorlist(outputs) + grads = grad_fn(ys, xs, v) + outputs, grads = return_fn(outputs), return_fn(grads) + + return outputs, grads + + +@framework.dygraph_only +def jvp(func, inputs, v=None, create_graph=False, allow_unused=False): + r""" + Computes the Jacobian-Vector product for a function at the given + inputs and a vector in the tangent space induced by the inputs. + + .. note:: + **This API is ONLY available in imperative mode.** + + Args: + func(Callable): `func` takes as input a tensor or a list + of tensors and returns a tensor or a list of tensors. + inputs(list[Tensor]|Tensor): used as positional arguments + to evaluate `func`. `inputs` is accepted as one tensor + or a list of tensors. + v(list[Tensor]|Tensor, optional): the tangent vector + invovled in the JVP computation. `v` matches the size + and shape of `inputs`. `v` is Optional if `func` returns + a single tensor. Default value is None and in this case + is equivalent to all ones the same size of `inputs`. + create_graph(bool, optional): if `True`, gradients can + be evaluated on the results. If `False`, taking gradients + on the results is invalid. Default value is False. + allow_unused(bool, optional): In case that some Tensors of + `inputs` do not contribute to the computation of the output. + If `allow_unused` is False, an error will be raised, + Otherwise, the gradients of the said inputs are returned + None. Default value is False. + + Returns: + output(tuple): + func_out: the output of `func(inputs)` + jvp(list[Tensor]|Tensor): the pullback results of `v` on `func` + + Examples: + .. 
code-block:: python + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + + output, inputs_grad = jvp(func, x) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 2.], + # [2., 2.]])] + + v = paddle.to_tensor([[1.0, 0.0], [0.0, 0.0]]) + output, inputs_grad = vjp(func, x, v) + print(inputs_grad) + # [Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 1.], + # [0., 0.]])] + + """ + xs, v = to_tensorlist(inputs), to_tensorlist(v) + + with gradient_scope( + xs, v, create_graph=create_graph, + allow_unused=allow_unused) as [xs, v, grad_fn, return_fn]: + outputs = func(*xs) + ys = to_tensorlist(outputs) + ys_grad = [zeros_like(y) for y in ys] + xs_grad = grad_fn(ys, xs, ys_grad, create_graph=True) + ys_grad = grad_fn(xs_grad, ys_grad, v) + outputs, ys_grad = return_fn(outputs), return_fn(ys_grad) + + return outputs, ys_grad + + +@framework.dygraph_only +def jacobian(func, inputs, create_graph=False, allow_unused=False): + ''' + .. note:: + **This API is ONLY available in imperative mode.** + + This API computes the Jacobian matrix of `func` with respect to `inputs`. + + Parameters: + func (function): a Python function that takes a Tensor or a Tensor + list/tuple as inputs and returns a Tensor or a Tensor tuple. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the function ``func``. + create_graph (bool, optional): whether to create the gradient graphs + of the computing process. When it is True, higher order derivatives + are supported to compute; when it is False, the gradient graphs of + the computing process would be discarded. Defaults to ``False``. + allow_unused (bool, optional): whether to raise error or return None if + some Tensors of `inputs` are unreachable in the graph. Error would + be raised if allow_unused=False, and None would be returned as + their gradients if allow_unused=True. Default False. + Returns: + Jacobian (Tensor or nested tuple of Tensors): if function ``func`` + takes a Tensor as inputs and returns a Tensor as outputs, Jacobian + will be a single Tensor containing the Jacobian matrix for the + linearized inputs and outputs. If one of the inputs and outputs is + a Tensor, and another is a Tensor list/tuple, then the Jacobian will + be a tuple of Tensors. If both of inputs and outputs are Tensor + list/tuple, then the Jacobian will be a tuple of tuple of Tensors + where ``Jacobian[i][j]`` will contain the Jacobian matrix of the + linearized ``i``th output and ``j``th input and will have same + dtype and device as the corresponding input. ``Jacobian[i][j]`` will + have as size ``m * n``, where ``m`` and ``n`` denote the numbers of + elements of ``i``th output and ``j``th input respectively. + + + Examples 1: + .. code-block:: python + + import paddle + + def func(x): + return paddle.matmul(x, x) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, x) + print(jacobian) + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 2., 0., 1.], + # [1., 0., 2., 1.], + # [0., 1., 1., 2.]]) + + Examples 2: + .. 
code-block:: python + + import paddle + + def func(x, y): + return paddle.matmul(x, y) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') * 2 + x.stop_gradient = False + y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [x, y], create_graph=True) + print(jacobian) + # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[2., 2., 0., 0.], + # [2., 2., 0., 0.], + # [0., 0., 2., 2.], + # [0., 0., 2., 2.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[1., 0., 1., 0.], + # [0., 1., 0., 1.], + # [1., 0., 1., 0.], + # [0., 1., 0., 1.]])) + + Examples 3: + .. code-block:: python + + import paddle + + def func(x, y): + return paddle.matmul(x, y), x * x + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') * 2 + x.stop_gradient = False + y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [x, y], allow_unused=True) + print(jacobian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 2., 0., 0.], + # [2., 2., 0., 0.], + # [0., 0., 2., 2.], + # [0., 0., 2., 2.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 0., 1., 0.], + # [0., 1., 0., 1.], + # [1., 0., 1., 0.], + # [0., 1., 0., 1.]])), + # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 0., 0., 0.], + # [0., 2., 0., 0.], + # [0., 0., 2., 0.], + # [0., 0., 0., 2.]]), None)) + + ''' + inputs = _check_tensors(inputs, "inputs") + outputs = _check_tensors(func(*inputs), "outputs") + fin_size = len(inputs) + fout_size = len(outputs) + flat_outputs = tuple(reshape(output, shape=[-1]) for output in outputs) + jacobian = tuple() + for i, flat_output in enumerate(flat_outputs): + jac_i = list([] for _ in range(fin_size)) + for k in range(len(flat_output)): + row_k = grad( + flat_output[k], + inputs, + create_graph=create_graph, + retain_graph=True, + allow_unused=allow_unused) + for j in range(fin_size): + jac_i[j].append( + reshape( + row_k[j], shape=[-1]) + if isinstance(row_k[j], paddle.Tensor) else None) + jacobian += (tuple( + _stack_tensor_or_return_none(jac_i_j) for jac_i_j in jac_i), ) + if fin_size == 1 and fout_size == 1: + return jacobian[0][0] + elif fin_size == 1 and fout_size != 1: + return tuple(jacobian[i][0] for i in range(fout_size)) + elif fin_size != 1 and fout_size == 1: + return jacobian[0] + else: + return jacobian + + +@framework.dygraph_only +def hessian(func, inputs, create_graph=False, allow_unused=False): + ''' + .. note:: + **This API is ONLY available in imperative mode.** + + This API computes the Hessian matrix of `func` with respect to `inputs`. + + Parameters: + func (function): a Python function that takes a Tensor or a Tensor + list/tuple as inputs and returns a Tensor with a single element. + inputs (Tensor|list(Tensor)|tuple(Tensor)): the input Tensor or + Tensor list/tuple of the function ``func``. + create_graph (bool, optional): whether to create the gradient graphs + of the computing process. When it is True, higher order derivatives + are supported to compute; when it is False, the gradient graphs of + the computing process would be discarded. Defaults to ``False``. + allow_unused (bool, optional): whether to raise error or return None if + some Tensors of `inputs` are unreachable in the graph. 
Error would + be raised if allow_unused=False, and None would be returned as + their gradients if allow_unused=True. Default False. + Returns: + Hessian (Tensor or a tuple of tuple of Tensors): if function ``func`` + takes a Tensor as ``inputs``, Hessian will be a single Tensor containing + the Hessian matrix for the linearized ``inputs`` Tensor. If function + ``func`` takes a Tensor list/tuple as ``inputs``, then the Hessian will + be a tuple of tuple of Tensors where ``Hessian[i][j]`` will contain the + Hessian matrix of the ``i``th input and ``j``th input with size ``m * n``. + Here ``m`` and ``n`` denote the number of elements of the ``i`` th input + and the ``j`` th input respectively. + + Examples 1: + .. code-block:: python + + import paddle + + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + hessian = paddle.autograd.hessian(func, x) + print(hessian) + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 0., 2., 1.], + # [1., 2., 0., 1.], + # [0., 1., 1., 2.]]) + + Examples 2: + .. code-block:: python + + import paddle + + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [x, y]) + print(hessian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 1., 0., 0.], + # [0., 0., 1., 1.], + # [1., 1., 0., 0.], + # [0., 0., 1., 1.]])), + # (Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[1., 0., 1., 0.], + # [1., 0., 1., 0.], + # [0., 1., 0., 1.], + # [0., 1., 0., 1.]]), + # Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.], + # [0., 0., 0., 0.]]))) + + Examples 3: + .. code-block:: python + + import paddle + + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + x = paddle.ones(shape=[2, 2], dtype='float32') + y = paddle.ones(shape=[2, 2], dtype='float32') + x.stop_gradient = False + y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [x, y], allow_unused=True) + print(hessian) + # ((Tensor(shape=[4, 4], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[2., 1., 1., 0.], + # [1., 0., 2., 1.], + # [1., 2., 0., 1.], + # [0., 1., 1., 2.]]), None), (None, None)) + + ''' + inputs = _check_tensors(inputs, "inputs") + outputs = func(*inputs) + assert isinstance(outputs, paddle.Tensor) and outputs.shape == [ + 1 + ], "The function to compute Hessian matrix should return a Tensor with a single element" + + def jac_func(*ins): + grad_inputs = grad( + outputs, + ins, + create_graph=True, + retain_graph=True, + allow_unused=allow_unused) + return tuple( + _replace_none_with_zero_tensor(grad_inputs[i], inputs[i]) + for i in range(len(inputs))) + + return jacobian( + jac_func, inputs, create_graph=create_graph, allow_unused=allow_unused) diff --git a/python/paddle/autograd/utils.py b/python/paddle/autograd/utils.py new file mode 100644 index 0000000000000..d437f7d82d361 --- /dev/null +++ b/python/paddle/autograd/utils.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + + +def _check_tensors(in_out_list, name): + assert in_out_list is not None, "{} should not be None".format(name) + + if isinstance(in_out_list, (list, tuple)): + assert len(in_out_list) > 0, "{} cannot be empty".format(name) + for each_var in in_out_list: + assert isinstance( + each_var, + paddle.Tensor), "Elements of {} must be paddle.Tensor".format( + name) + return list(in_out_list) + else: + assert isinstance( + in_out_list, + paddle.Tensor), "{} must be Tensor or list of Tensor".format(name) + return [in_out_list] + + +def _stack_tensor_or_return_none(origin_list): + assert len(origin_list) > 0, "Cannot stack an empty list" + return paddle.stack( + origin_list, axis=0) if isinstance(origin_list[0], + paddle.Tensor) else None + + +def _replace_none_with_zero_tensor(t, spec_t): + if t is None: + zero_t = paddle.zeros(shape=spec_t.shape, dtype=spec_t.dtype) + zero_t.stop_gradient = spec_t.stop_gradient + return zero_t + else: + return t diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index 4d1934aeed9fb..970fb35bfaeb1 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -27,6 +27,9 @@ 'device_count', 'empty_cache', 'stream_guard', + 'get_device_properties', + 'get_device_name', + 'get_device_capability', ] @@ -204,3 +207,127 @@ def stream_guard(stream): yield finally: stream = _set_current_stream(pre_stream) + + +def get_device_properties(device=None): + ''' + Return the properties of the given device. + + Args: + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x' which to get the properties of the + device from. If device is None, the device is the current device. + Default: None. + + Returns: + _gpuDeviceProperties: The properties of the device which include ASCII string + identifying device, major compute capability, minor compute capability, global + memory available and the number of multiprocessors on the device. + + Examples: + + .. code-block:: python + + # required: gpu + + import paddle + paddle.device.cuda.get_device_properties() + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + paddle.device.cuda.get_device_properties(0) + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + paddle.device.cuda.get_device_properties('gpu:0') + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + paddle.device.cuda.get_device_properties(paddle.CUDAPlace(0)) + # _gpuDeviceProperties(name='A100-SXM4-40GB', major=8, minor=0, total_memory=40536MB, multi_processor_count=108) + + ''' + + if not core.is_compiled_with_cuda(): + raise ValueError( + "The API paddle.device.cuda.get_device_properties is not supported in " + "CPU-only PaddlePaddle. 
Please reinstall PaddlePaddle with GPU support " + "to call this API.") + + if device is not None: + if isinstance(device, int): + device_id = device + elif isinstance(device, core.CUDAPlace): + device_id = device.get_device_id() + elif isinstance(device, str): + if device.startswith('gpu:'): + device_id = int(device[4:]) + else: + raise ValueError( + "The current string {} is not expected. Because paddle.device." + "cuda.get_device_properties only support string which is like 'gpu:x'. " + "Please input appropriate string again!".format(device)) + else: + raise ValueError( + "The device type {} is not expected. Because paddle.device.cuda." + "get_device_properties only support int, str or paddle.CUDAPlace. " + "Please input appropriate device again!".format(device)) + else: + device_id = -1 + + return core.get_device_properties(device_id) + + +def get_device_name(device=None): + ''' + Return the name of the device which is got from CUDA function `cudaDeviceProp `_. + + Parameters: + device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. + + Returns: + str: The name of the device. + + Examples: + + .. code-block:: python + + # required: gpu + + import paddle + + paddle.device.cuda.get_device_name() + + paddle.device.cuda.get_device_name(0) + + paddle.device.cuda.get_device_name(paddle.CUDAPlace(0)) + + ''' + + return get_device_properties(device).name + + +def get_device_capability(device=None): + ''' + Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp `_. + + Parameters: + device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. + + Returns: + tuple(int,int): the major and minor revision numbers defining the device's compute capability. + + Examples: + + .. code-block:: python + + # required: gpu + + import paddle + + paddle.device.cuda.get_device_capability() + + paddle.device.cuda.get_device_capability(0) + + paddle.device.cuda.get_device_capability(paddle.CUDAPlace(0)) + + ''' + prop = get_device_properties(device) + return prop.major, prop.minor diff --git a/python/paddle/device/cuda/graphs.py b/python/paddle/device/cuda/graphs.py new file mode 100644 index 0000000000000..612f4d2c8cebd --- /dev/null +++ b/python/paddle/device/cuda/graphs.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
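+# NOTE(editorial): the lines below are an illustrative usage sketch, not part
+# of the original patch. Assuming a CUDA (non-ROCm) build, the CUDAGraph
+# wrapper defined in this file is used roughly like:
+#
+#     g = CUDAGraph(place=CUDAPlace(0), mode="thread_local")
+#     g.capture_begin()
+#     ...                # launch the GPU work that should be recorded
+#     g.capture_end()
+#     g.replay()         # re-submit the captured kernels
+#     g.reset()          # release the captured graph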
+ +from paddle.fluid.core import is_compiled_with_cuda, is_compiled_with_rocm, CUDAPlace + +if is_compiled_with_cuda() and not is_compiled_with_rocm(): + from paddle.fluid.core import CUDAGraph as CoreCUDAGraph + + class CUDAGraph: + def __init__(self, place=None, mode="thread_local"): + ALL_MODES = ["global", "thread_local", "relaxed"] + self._graph = None + if place is None: + place = CUDAPlace(0) + self._place = place + assert mode in ALL_MODES + self._mode = ALL_MODES.index(mode) + + def capture_begin(self): + CoreCUDAGraph.begin_capture(self._place, self._mode) + + def capture_end(self): + self._graph = CoreCUDAGraph.end_capture() + + def replay(self): + self._graph.replay() + + def reset(self): + self._graph.reset() +else: + + class CUDAGraph: + def __init__(self, place=None, mode="thread_local"): + raise NotImplementedError() + + def capture_begin(self): + raise NotImplementedError() + + def capture_end(self): + raise NotImplementedError() + + def replay(self): + raise NotImplementedError() + + def reset(self): + raise NotImplementedError() diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 5b0fdc1f1f166..31f92e2575a1f 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -19,5 +19,7 @@ from .interface import set_pipeline_stage # noqa: F401 from .interface import ProcessMesh # noqa: F401 from .completion import complete_annotation # noqa: F401 +from .completion import complete_backward_annotation # noqa: F401 +from .reshard import reshard # noqa: F401 __all__ = [] diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 6e886d09d67bd..3fdbad6950db5 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -23,6 +23,7 @@ from .utils import print_program_with_distributed_attr from .context import get_default_distributed_context from .operators import find_best_compatible_distributed_operator_impl +from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute ELEMENTWISE_LIKE_OP_LIST = ["elementwise_add", "gelu", "dropout", "cast"] @@ -597,3 +598,172 @@ def sort_key_fun(node): dist_context.amend_distributed_attr_for_program() return program + + +def complete_backward_annotation(auto_parallel_main_prog, dist_context): + """Complete the annotation of vars and ops in the backward phase for parallel program.""" + + def _is_grad_var_name(name): + if "@GRAD" in name: + return True + return False + + grad_start_idx = None + for idx, op in enumerate(auto_parallel_main_prog.global_block().ops): + for var_name in op.output_arg_names: + # TODO: use _is_loss_op to judge + if "@GRAD" in var_name and op.type == "fill_constant": + grad_start_idx = idx + break + assert grad_start_idx is not None, "No backward procedure found in this program." 
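+    # Everything from grad_start_idx onward is treated as the backward/optimizer
+    # region; the index points at the fill_constant op that writes the initial
+    # loss gradient (loss@GRAD).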
+ + ops = list(auto_parallel_main_prog.global_block().ops) + vars = auto_parallel_main_prog.global_block().vars + for idx in range(grad_start_idx, len(ops)): + # complete the loss op + if idx == grad_start_idx: + grad_var = vars[ops[idx].output_arg_names[0]] + grad_var_name = grad_var.name + forward_var_name = grad_var_name[:grad_var_name.find("@GRAD")] + forward_var = vars[forward_var_name] + tensor_attr = TensorDistributedAttribute(grad_var, dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program(grad_var, + tensor_attr) + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + + # in the data parallel mode, the loss op followed by scale op. + if ops[idx + 1].type == "scale" and grad_var_name in ops[idx + 1].input_arg_names \ + and grad_var_name in ops[idx + 1].output_arg_names: + op_attr = OperatorDistributedAttribute(ops[idx + 1], + dist_context) + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx + 1], + op_attr) + continue + + # complete the annotation of the optimizer op. + # TODO: use _is_optimizer_op to judge + if "Grad" in ops[idx].input_names and "Param" in ops[idx].input_names: + assert len(ops[idx].input( + "Param")) == 1, "Only support one-to-one now." + assert len(ops[idx].input( + "Grad")) == 1, "Only support one-to-one now." + var = vars[ops[idx].input("Param")[0]] + grad_var = vars[ops[idx].input("Grad")[0]] + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + var).get_dims_mapping() + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + op_attr.set_process_mesh(process_mesh) + op_attr.set_input_dims_mapping(grad_var.name, dims_mapping) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + continue + + # complete the c_allreduce_sum op for gradient in the data parallel mode. 
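+        # These are the gradient all-reduce ops the partitioner adds for data
+        # parallelism; they are recognized by having identical input and output
+        # argument lists, and they inherit the process mesh of the gradient var.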
+ if ops[idx].type == "c_allreduce_sum" and ops[ + idx].input_arg_names == ops[idx].output_arg_names: + grad_var = vars[ops[idx].output_arg_names[0]] + op_attr = OperatorDistributedAttribute(ops[idx], dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + grad_var).get_process_mesh() + op_attr.set_process_mesh(process_mesh) + dist_context.set_op_distributed_attr_for_program(ops[idx], op_attr) + continue + + # complete the annotation of grad op + grad_op = ops[idx] + for i, op in enumerate(ops[:grad_start_idx]): + match_op = None + grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(op.desc, + set(), + []) + grad_op_input = [] + for input_arg_name in grad_op.desc.input_arg_names(): + if "@GRAD" in input_arg_name: + name = input_arg_name[:input_arg_name.find("@GRAD") + 5] + grad_op_input.append(name) + else: + grad_op_input.append(input_arg_name) + + # like sum op: the count of grad op will larger than 1 + if len(grad_op_desc_list) > 1: + for grad_op_desc in grad_op_desc_list: + if grad_op_input == grad_op_desc.input_arg_names() \ + and grad_op.desc.type() == grad_op_desc.type(): + match_op = op + break + elif len(grad_op_desc_list) == 1: + if grad_op_input == grad_op_desc_list[0].input_arg_names() \ + and grad_op.desc.type() == grad_op_desc_list[0].type(): + match_op = op + + if match_op is not None: + op_attr = dist_context.get_op_distributed_attr_for_program(op) + grad_op_attr = OperatorDistributedAttribute(grad_op, + dist_context) + grad_op_attr.set_process_mesh(op_attr.get_process_mesh()) + for var_name in grad_op.input_arg_names: + if "@GRAD" in var_name: + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + vars[var_name]).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, + dims_mapping) + else: + dims_mapping = op_attr.get_input_dims_mapping(var_name) + grad_op_attr.set_input_dims_mapping(var_name, + dims_mapping) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) + + for var_name in grad_op.output_arg_names: + if "@GRAD" in var_name: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = grad_op_attr.get_process_mesh() + dims_mapping = grad_op_attr.get_input_dims_mapping( + forward_var.name) + tensor_attr.set_process_mesh(process_mesh) + tensor_attr.set_dims_mapping(dims_mapping) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + break + + # complete the annotation of sum op for multiple renamed grad var + if grad_op.type == "sum" and all( + map(_is_grad_var_name, grad_op.input_arg_names)): + assert len(grad_op.output_arg_names + ) == 1, "The output count of sum op should be one." 
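+            # A sum op appears when one forward var collects several renamed
+            # gradients; its single output reuses the dims mapping and process
+            # mesh of the corresponding forward var.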
+ grad_op_attr = OperatorDistributedAttribute(grad_op, dist_context) + for var_name in grad_op.input_arg_names: + if "@GRAD" in var_name: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + grad_op_attr.set_input_dims_mapping(var_name, dims_mapping) + for var_name in grad_op.output_arg_names: + forward_var = vars[var_name[:var_name.find("@GRAD")]] + tensor_attr = TensorDistributedAttribute(vars[var_name], + dist_context) + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh() + dims_mapping = dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_dims_mapping() + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program( + vars[var_name], tensor_attr) + grad_op_attr.set_process_mesh( + dist_context.get_tensor_distributed_attr_for_program( + forward_var).get_process_mesh()) + dist_context.set_op_distributed_attr_for_program(grad_op, + grad_op_attr) diff --git a/python/paddle/distributed/auto_parallel/context.py b/python/paddle/distributed/auto_parallel/context.py index 4958c5adfae91..5e6565aa3d84c 100644 --- a/python/paddle/distributed/auto_parallel/context.py +++ b/python/paddle/distributed/auto_parallel/context.py @@ -59,6 +59,9 @@ def __init__(self): if self._process_mesh.ndim == 1: self._data_parallel_axis = 0 self._model_parallel_axis = 0 + elif self._process_mesh.ndim == 3: + self._data_parallel_axis = 1 + self._model_parallel_axis = 2 else: self._data_parallel_axis = 0 self._model_parallel_axis = 1 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py index 141c3d14a7fb2..3f8fbf9cc3a7a 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_embedding.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_embedding.py @@ -146,8 +146,18 @@ def static_handle(dst_block, assert mesh_shape <= 2, "row_parallel_embedding only support 1 or 2 dimensional process mesh, but got {}".format( process_mesh_shape) num_partition = process_mesh_shape[embedding_row_dim_mapping] - # TODO generalize here, support any mesh group + # TODO generalize here, support any mesh group + model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( + )._get_model_parallel_info() if mesh_shape == 1: + if rank_id not in process_mesh_group: + assert len( + process_mesh.topology + ) == 2, " row_parallel_embedding process mapping only support 2 dimensional process mesh, \ + but got {}".format(len(process_mesh.topology)) + rank_id = process_mesh_group[ + process_mesh.process_group.index(rank_id) % + process_mesh_shape[0]] relative_idx = process_mesh_group.index(rank_id) else: relative_idx = rank_id % num_partition @@ -156,8 +166,6 @@ def static_handle(dst_block, relative_idx = relative_idx * per_part_size # TODO caculate ring id - model_parallel_axis, process_mesh = op_dist_attr.get_owner_context( - )._get_model_parallel_info() group_ranks = _get_comm_group(process_mesh.process_group, process_mesh.topology, model_parallel_axis, rank_id) diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index a08da13a39caf..2994d35ef9202 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -17,9 +17,10 
@@ import paddle.fluid.core as core from .context import DistributedContext from .context import get_default_distributed_context -from .completion import complete_annotation +from .completion import complete_annotation, complete_backward_annotation from .partitioner import Partitioner from .process import get_all_process_groups +from .reshard import reshard class AutoParallelizer: @@ -85,10 +86,16 @@ def parallelize(self, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() for process_group in all_process_groups: + if rank not in process_group._ranks: + continue process_group.instantiate() # The last step: remove all distributed attributes to be compatiable # with inference. self._remove_distributed_attrs(partitioned_main_prog) + complete_backward_annotation(partitioned_main_prog, self._dist_context) + reshard(partitioned_main_prog, partitioned_startup_prog, rank, + self._dist_context) + return dist_optimize_ops, dist_params_grads, partitioned_startup_prog, partitioned_main_prog diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py new file mode 100644 index 0000000000000..d66d799c6e0f9 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -0,0 +1,1002 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import copy +from functools import reduce + +import paddle +import paddle.fluid.core as core +from paddle.utils import unique_name +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import Program, OpProtoHolder +import paddle.fluid.layers.utils as utils +from ..collective import _get_global_env +from .context import DistributedContext +from .attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from .process import new_process_group, ProcessGroup, PROCESS_GROUP_MAP + + +class AllGatherOpDesc: + """ + Describe the allgather op in the reshard phase. + + Args: + group (list): Process group. + """ + + def __init__(self, group): + self._group = group + self._desc = "all_gather" + + @property + def group(self): + return self._group + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, group: {self._group}." + + +class SendOpDesc: + """ + Describe the send op in the reshard phase. + + Args: + partition_index (list): The index of partition in complete tensor. + dst (int): The destination process to receive. + """ + + def __init__(self, partition_index, dst): + self._dst = dst + self._partition_index = partition_index + self._desc = "send" + + @property + def partition_index(self): + return self._partition_index + + @property + def dst(self): + return self._dst + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index: {self._partition_index}, dst: {self._dst}." + + +class RecvOpDesc: + """ + Describe the recv op in the reshard op. 
+ + Args: + partition_index (list): The index of partition in complete tensor. + src (int): The source process to send. + """ + + def __init__(self, partition_index, src): + self._src = src + self._partition_index = partition_index + self._desc = "recv" + + @property + def partition_index(self): + return self._partition_index + + @property + def src(self): + return self._src + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index: {self._partition_index}, src: {self._src}." + + +class SliceOpDesc: + """ + Describe the slice op in the reshard phase. + + Args: + starts (list): It represents starting indices of corresponding axis in ``axes``. + ends (list): It represents ending indices of corresponding axis in ``axes``. + axes (list): Axes that `starts` and `ends` apply to . + """ + + def __init__(self, starts, ends, axes): + self._starts = starts + self._ends = ends + self._axes = axes + self._desc = "slice" + + @property + def starts(self): + return self._starts + + @property + def ends(self): + return self._ends + + @property + def axes(self): + return self._axes + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, starts: {self._starts}, ends: {self._ends}, axes: {self._axes}." + + +class ConcatOpDesc: + """ + Describe the concat op in the reshard phase. + + Args: + partition_index_list (list): A list contains all partition index. + """ + + def __init__(self, partition_index_list): + self._partition_index_list = partition_index_list + self._desc = "concat" + + @property + def partition_index_list(self): + return self._partition_index_list + + @property + def desc(self): + return self._desc + + def __repr__(self): + return f"op: {self._desc}, partition_index_list: {self._partition_index_list}." 
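Taken together, AllGatherOpDesc, SendOpDesc, RecvOpDesc, ConcatOpDesc and SliceOpDesc describe a per-rank reshard plan that the parsing pass below turns into concrete ops. A small hedged illustration of such a plan (the ranks and shapes are made up purely for exposition; the import path is the module added by this patch):

from paddle.distributed.auto_parallel.reshard import (
    RecvOpDesc, ConcatOpDesc, SliceOpDesc)

# Hypothetical plan for one rank of an 8 x 8 tensor that is row-sharded over
# two ranks: receive the missing row block, concatenate it with the local
# block to rebuild the full tensor, then slice out the rows this rank needs.
plan = [
    RecvOpDesc(partition_index=[[0, 4], [0, 8]], src=0),
    ConcatOpDesc(partition_index_list=[[[0, 4], [0, 8]], [[4, 8], [0, 8]]]),
    SliceOpDesc(starts=[0, 0], ends=[4, 8], axes=[0, 1]),
]
for op_desc in plan:
    print(op_desc)  # each descriptor class provides a readable __repr__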
+ + +def _compute_partition_shape(complete_shape, dims_mapping, process_shape): + """Compute the shape of partition.""" + partition_shape = [] + for idx, item in enumerate(complete_shape): + if dims_mapping[idx] == -1: + partition_shape.append(item) + else: + partition_shape.append(item // process_shape[dims_mapping[idx]]) + + return partition_shape + + +def _compute_process_index(process, process_group, process_shape): + """Compute the index of process_shape corresponding to the process.""" + relative_process = process_group.index(process) + process_index = [] + product = reduce(lambda x, y: x * y, process_shape) + + for i in range(len(process_shape)): + idx = relative_process // (product // process_shape[i]) + product = product // process_shape[i] + relative_process = relative_process - relative_process // product * product + process_index.append(idx) + + return process_index + + +def _compute_partition_index(process, complete_shape, dims_mapping, + process_shape, process_group): + """Compute the partition index in complete tensor.""" + partition_shape = _compute_partition_shape(complete_shape, dims_mapping, + process_shape) + process_index = _compute_process_index(process, process_group, + process_shape) + partition_index = [] + + for i in range(len(complete_shape)): + if dims_mapping[i] == -1: + partition_index.append([0, partition_shape[i]]) + else: + partition_index.append([ + process_index[dims_mapping[i]] * partition_shape[i], + (process_index[dims_mapping[i]] + 1) * partition_shape[i] + ]) + + return partition_index + + +def _compute_concat_info(partition_index_x, partition_index_y): + """Judge whether two partition can be concatenated and compute concatenated partition index.""" + differ_count = 0 + concat_axis = -1 + first_order = 0 + new_partition = [] + + for idx, item in enumerate(partition_index_x): + if item != partition_index_y[idx]: + differ_count += 1 + if item[1] == partition_index_y[idx][0] and item[ + 0] < partition_index_y[idx][1]: + concat_axis = idx + new_partition.append([item[0], partition_index_y[idx][1]]) + elif item[0] == partition_index_y[idx][1] and item[ + 1] > partition_index_y[idx][0]: + first_order = 1 + concat_axis = idx + new_partition.append([partition_index_y[idx][0], item[1]]) + else: + new_partition.append(item) + + if differ_count == 1: + return concat_axis, first_order, new_partition + else: + return -1, first_order, new_partition + + +def _concat_partitions(partition_index_list, partition_index): + """Concat the given partitions without inserting concat op.""" + if not partition_index_list: + partition_index_list.append(partition_index) + else: + i = 0 + has_concat = False + while i < len(partition_index_list): + concat_axis, _, new_partition = _compute_concat_info( + partition_index_list[i], partition_index) + if concat_axis != -1: + has_concat = True + partition_index_list.pop(i) + _concat_partitions(partition_index_list, new_partition) + break + i += 1 + if not has_concat: + partition_index_list.append(partition_index) + + +def _is_overlapped(shape_x, shape_y): + """Judge whether two partitions intersect on the specified dimension.""" + overlapped = False + if (shape_y[0] <= shape_x[0] < shape_y[1]) or ( + shape_x[0] <= shape_y[0] < shape_x[1]): + overlapped = True + return overlapped + + +def _need_reshard(tensor_dist_attr, op_dist_attr): + """Judge the tensor whether needs to be resharded.""" + is_reshard = False + tensor_dims_mapping = tensor_dist_attr.get_dims_mapping() + tensor_process_mesh = tensor_dist_attr.get_process_mesh() + 
op_input_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_dist_attr.get_owner_tensor().name) + op_process_mesh = op_dist_attr.get_process_mesh() + if all( + map(lambda x: x is not None, [ + tensor_dims_mapping, tensor_process_mesh, op_input_dims_mapping, + op_process_mesh + ])): + if tensor_dims_mapping != op_input_dims_mapping or tensor_process_mesh._id != op_process_mesh._id: + is_reshard = True + return is_reshard + + +def _compute_complete_shape(slice_shape, process_shape, dims_mapping): + """compute the complete shape of the slice tensor with its process mesh and dims mapping""" + complete_shape = [] + for idx, item in enumerate(slice_shape): + if dims_mapping[idx] == -1: + complete_shape.append(item) + else: + complete_shape.append(item * process_shape[dims_mapping[idx]]) + return complete_shape + + +def find_op_desc_seq(source_tensor, tensor_dist_attr, op_dist_attr): + """ + Find the op description sequence to reshard the source tensor for matching the op requirement. + + Args: + source_tensor (Variable): A tensor with distributed attribute. + tensor_dist_attr (TensorDistributedAttribute): The distributed attribute of tensor. + op_dist_attr (OperatorDistributedAttribute): The distributed attribute of operator. + + Returns: + Dict, the dict represents the required op description sequence corresponding to process, The key of dict is + process and value is a list containing op description. + """ + source_dims_mapping = tensor_dist_attr.get_dims_mapping() + source_process_mesh = tensor_dist_attr.get_process_mesh() + source_process_group = source_process_mesh.process_group + source_process_shape = source_process_mesh.topology + + target_process_mesh = op_dist_attr.get_process_mesh() + target_dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_dist_attr.get_owner_tensor().name) + target_process_group = target_process_mesh.process_group + target_process_shape = target_process_mesh.topology + + complete_shape = _compute_complete_shape( + source_tensor.shape, source_process_shape, source_dims_mapping) + op_desc_seq = {} + + # TODO: if the target process group has the same process with source process group + if set(target_process_group).intersection(set( + source_process_group)) and set(target_process_group).difference( + set(source_process_group)): + pass + + # in the different process group, it will use send, recv, concat and slice op + elif target_process_group != source_process_group: + partition_process_mapping_list = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index(source_process, complete_shape, source_dims_mapping, \ + source_process_shape, source_process_group) + if not partition_process_mapping_list: + partition_process_mapping_list.append( + [source_partition_index, [source_process], [False]]) + else: + partition_list = list( + [item[0] for item in partition_process_mapping_list]) + process_list = list( + [item[1] for item in partition_process_mapping_list]) + has_used = list( + [item[2] for item in partition_process_mapping_list]) + if partition_list.count(source_partition_index) == 1: + index = partition_list.index(source_partition_index) + process_list[index].append(source_process) + has_used[index].append(False) + else: + partition_process_mapping_list.append( + [source_partition_index, [source_process], [False]]) + + for target_process in target_process_group: + has_sent = [] + target_partition_index = _compute_partition_index( + target_process, complete_shape, target_dims_mapping, + target_process_shape, 
target_process_group) + partition_index_list = [] + all_partition_index_list = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index( + source_process, complete_shape, source_dims_mapping, + source_process_shape, source_process_group) + to_send_process = None + if all(_ for _ in list(map(_is_overlapped, source_partition_index, target_partition_index))) \ + and source_partition_index not in has_sent: + idx = list([ + item[0] for item in partition_process_mapping_list + ]).index(source_partition_index) + has_used = list( + [item[2] + for item in partition_process_mapping_list])[idx] + process_list = list( + [item[1] + for item in partition_process_mapping_list])[idx] + i = 0 + while i < len(has_used): + if not has_used[i]: + to_send_process = process_list[i] + has_used[i] = True + break + i += 1 + if i == len(has_used): + has_used = list(map(lambda x: False, has_used)) + to_send_process = process_list[0] + has_used[0] = True + assert to_send_process is not None, "Failed to find the send process." + + if to_send_process not in op_desc_seq.keys(): + op_desc_seq[to_send_process] = [] + if target_process not in op_desc_seq.keys(): + op_desc_seq[target_process] = [] + all_partition_index_list.append(source_partition_index) + + # append send and recv op desc + send_op_desc = SendOpDesc(source_partition_index, + target_process) + recv_op_desc = RecvOpDesc(source_partition_index, + to_send_process) + op_desc_seq[to_send_process].append(send_op_desc) + op_desc_seq[target_process].append(recv_op_desc) + has_sent.append(source_partition_index) + _concat_partitions(partition_index_list, + source_partition_index) + + # append concat op desc + op_desc_seq[target_process].append( + ConcatOpDesc(all_partition_index_list)) + + # append slice op desc + slice_starts = [] + slice_ends = [] + slices_axes = [] + concatenated_partition_index = partition_index_list[0] + for idx, item in enumerate(concatenated_partition_index): + slice_starts.append(target_partition_index[idx][0] - item[0]) + slice_ends.append(target_partition_index[idx][1] - item[0]) + slices_axes.append(idx) + op_desc_seq[target_process].append( + SliceOpDesc(slice_starts, slice_ends, slices_axes)) + + # in the same process group, it will use allgahther and slice op + else: + partition_index_list = [] + all_partition_index_list = [] + process_index = [] + for source_process in source_process_group: + source_partition_index = _compute_partition_index( + source_process, complete_shape, source_dims_mapping, + source_process_shape, source_process_group) + if source_partition_index not in partition_index_list: + partition_index_list.append(source_partition_index) + process_index.append( + [[source_process, ], source_partition_index]) + else: + process_index[partition_index_list.index( + source_partition_index)][0].append(source_process) + + for i in range(len(process_index[0][0])): + group = [] + for j in range(len(process_index)): + group.append(process_index[j][0][i]) + if i == 0: + all_partition_index_list.append(process_index[j][1]) + for process in group: + # append slice op desc + slice_starts = [] + slice_ends = [] + slices_axes = [] + target_partition_index = _compute_partition_index( + process, complete_shape, target_dims_mapping, + target_process_shape, target_process_group) + for idx, item in enumerate(target_partition_index): + slice_starts.append(item[0]) + slice_ends.append(item[1]) + slices_axes.append(idx) + + slice_op_desc = SliceOpDesc( + starts=slice_starts, ends=slice_ends, 
axes=slices_axes) + op_desc_seq[process] = [AllGatherOpDesc(group=group), + ConcatOpDesc(partition_index_list=all_partition_index_list), slice_op_desc] \ + if len(group) > 1 else [slice_op_desc] + + return op_desc_seq + + +def _insert_send_op(block, idx, tensor, dst): + """Insert send op into block at the given index.""" + op_type = 'send_v2' + block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + attrs={ + 'ring_id': 0, + 'peer': dst, + 'use_calc_stream': True, + }) + + +def _insert_recv_op(block, idx, tensor, src): + """Insert recv op into block at the given index.""" + op_type = 'recv_v2' + block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': 0, + 'peer': src, + 'out_shape': tensor.shape, + 'dtype': tensor.dtype, + 'use_calc_stream': True, + }) + + +def _insert_concat_op(block, idx, tensors, axis): + """Insert concat op into block at the given block.""" + inputs = {'X': tensors} + attrs = {} + attrs['axis'] = axis + helper = LayerHelper('concat', **locals()) + with paddle.static.program_guard(block.program): + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + block._insert_op( + idx, type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out + + +def _insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name): + """Insert slice op into block at the given block.""" + inputs = {'Input': tensor} + infer_flags = list(1 for i in range(len(axes))) + attrs = { + "axes": axes, + "starts": starts, + "ends": ends, + "infer_flags": infer_flags + } + helper = LayerHelper('slice', **locals()) + out = block.create_var( + name=new_var_name, + dtype=tensor.dtype, + type=core.VarDesc.VarType.LOD_TENSOR) + block._insert_op( + idx, type="slice", inputs=inputs, outputs={'Out': [out]}, attrs=attrs) + return out + + +def _insert_split_op(block, idx, tensor, num_or_sections): + """Insert split op into block at the given index.""" + helper = LayerHelper('split', **locals()) + input_shape = tensor.shape + inputs = {'X': tensor} + attrs = {'num': num_or_sections, "axis": 0} + with paddle.static.program_guard(block.program): + outs = [ + helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) for i in range(num_or_sections) + ] + block._insert_op( + idx, type="split", inputs=inputs, outputs={'Out': outs}, attrs=attrs) + return outs + + +def _insert_allgather_op(block, idx, tensor, ranks): + """Insert allgather op into block at the given index.""" + + def _insert_fill_constant_op(block, idx): + """Insert fill constant op into block at the given index.""" + helper = LayerHelper("fill_constant", **locals()) + with paddle.static.program_guard(block.program): + out = helper.create_variable_for_type_inference(dtype="int32") + inputs = {} + attrs = {'force_cpu': False} + attrs['str_value'] = str(int("1")) + attrs['value'] = int("1") + attrs['dtype'] = out.dtype + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant') + block._insert_op( + idx, + type='fill_constant', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs) + out.stop_gradient = True + return out + + tensor_list = [] + group = new_process_group(ranks) + idx_offset = 0 + + # instant process group before insert allgather op. 
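+    # The three ops inserted below (fill_constant -> c_allreduce_sum ->
+    # c_sync_calc_stream) act as a warm-up so that communication is set up
+    # before the first c_allgather runs; they also shift later insert
+    # positions by 3, which is what idx_offset tracks.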
+ if not group.is_instantiate(): + # insert fill_constant op + fill_constant_out = _insert_fill_constant_op(block, idx) + fill_constant_out.stop_gradient = True + + # insert c_allreduce_sum op + block._insert_op( + idx + 1, + type="c_allreduce_sum", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}, + attrs={'ring_id': 0, + 'use_calc_stream': True}) + + # insert c_sync_calc_stream op + block._insert_op( + idx + 2, + type="c_sync_calc_stream", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}) + idx_offset = 3 + + # insert c_allgather op + op_type = 'c_allgather' + helper = LayerHelper(op_type, **locals()) + with paddle.static.program_guard(block.program): + allgather_out = helper.create_variable_for_type_inference( + dtype=tensor.dtype) + block._insert_op( + idx + idx_offset, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [allgather_out]}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'nranks': group._nranks + }) + idx_offset += 1 + + # insert split op + split_out = _insert_split_op(block, idx + idx_offset, allgather_out, + group._nranks) + idx_offset += 1 + tensor_list.extend(split_out) + return tensor_list, idx_offset + + +def _concat_partitions_with_op(partition_tensor_list, tensor, partition_index, + block, idx): + """Concat the tensors and insert concat op.""" + if not partition_tensor_list: + partition_tensor_list.append((tensor, partition_index)) + else: + i = 0 + has_concat = False + while i < len(partition_tensor_list): + concat_axis, first_order, new_partition = _compute_concat_info( + partition_tensor_list[i][1], partition_index) + if concat_axis != -1: + has_concat = True + _ = _insert_concat_op(block, idx[0], [partition_tensor_list[i][0], tensor], concat_axis) \ + if first_order == 0 else \ + _insert_concat_op(block, idx[0], [tensor, partition_tensor_list[i][0]], concat_axis) + partition_tensor_list.pop(i) + idx[0] += 1 + _concat_partitions_with_op(partition_tensor_list, _, + new_partition, block, idx) + break + i += 1 + if not has_concat: + partition_tensor_list.append((tensor, partition_index)) + + +def _init_comm_for_send_recv(): + if not PROCESS_GROUP_MAP["global_group"].is_instantiate(): + PROCESS_GROUP_MAP["global_group"].instantiate() + + +HAS_SENT = {} +HAS_RECV = {} +HAS_ALLGATHER = {} + + +def parse_op_desc(program, rank_id, op_desc_seq, var_name, reshard_op, + dist_context): + """Parse op desc sequence and insert op in the block""" + global HAS_SENT + global HAS_RECV + global HAS_ALLGATHER + tensor_list = [] + partition_tensor_list = [] + if rank_id not in op_desc_seq.keys(): + return + op_desc_list = op_desc_seq[rank_id] + block = program.global_block() + assert var_name in block.vars.keys( + ), "The {} cannot be found in the {} program.".format(var_name, rank_id) + + idx = None + for index, op in list(enumerate(block.ops)): + if op.desc.id == reshard_op.desc.id: + idx = index + break + assert idx is not None, "The op for reshard cannot be found in the rank {} program.".format( + rank_id) + + matched_op = block.ops[idx] + source_tensor = block.vars[var_name] + for op_desc in op_desc_list: + if isinstance(op_desc, AllGatherOpDesc): # noqa: F401 + if var_name not in HAS_ALLGATHER.keys(): + HAS_ALLGATHER[var_name] = [] + if not HAS_ALLGATHER[var_name] or op_desc.group not in list( + map(lambda x: x[0], HAS_ALLGATHER[var_name])): + tensor_list, idx_offset = _insert_allgather_op( + block, idx, source_tensor, op_desc.group) + idx += idx_offset + tensor_name_list = [var.name for var in 
tensor_list] + HAS_ALLGATHER[var_name].append( + [op_desc.group, tensor_name_list]) + else: + for item in HAS_ALLGATHER[var_name]: + if op_desc.group == item[0]: + tensor_list = [ + program.global_block().vars[var_name] + for var_name in item[1] + ] + break + assert tensor_list, "The result of parsing allgather op should not be None." + + elif isinstance(op_desc, SendOpDesc): + _init_comm_for_send_recv() + if var_name not in HAS_SENT.keys(): + HAS_SENT[var_name] = [] + if op_desc.dst not in HAS_SENT[var_name]: + _insert_send_op(block, idx, source_tensor, op_desc.dst) + idx += 1 + HAS_SENT[var_name].append(op_desc.dst) + + elif isinstance(op_desc, RecvOpDesc): + _init_comm_for_send_recv() + if var_name not in HAS_RECV.keys(): + HAS_RECV[var_name] = {} + if op_desc.src not in HAS_RECV[var_name].keys(): + partition_index = op_desc.partition_index + shape = [] + for index in partition_index: + shape.append(index[1] - index[0]) + recv_tensor = block.create_var( + name=unique_name.generate(var_name + "@recv"), + shape=shape, + dtype=source_tensor.dtype) + _insert_recv_op(block, idx, recv_tensor, op_desc.src) + tensor_list.append(recv_tensor) + idx += 1 + HAS_RECV[var_name][op_desc.src] = recv_tensor + else: + tensor_list.append(HAS_RECV[var_name][op_desc.src]) + + elif isinstance(op_desc, ConcatOpDesc): + partition_index_list = op_desc.partition_index_list + idx_list = [idx] + for index, tensor in enumerate(tensor_list): + _concat_partitions_with_op(partition_tensor_list, tensor, + partition_index_list[index], block, + idx_list) + idx = idx_list[0] + + elif isinstance(op_desc, SliceOpDesc): + assert len(partition_tensor_list) == 1 or not partition_tensor_list + to_slice_tensor = partition_tensor_list[0][0] if len( + partition_tensor_list) == 1 else source_tensor + new_name = unique_name.generate(var_name + "@RESHARD") + target_tensor = _insert_slice_op( + block, + idx, + to_slice_tensor, + starts=op_desc.starts, + ends=op_desc.ends, + axes=op_desc.axes, + new_var_name=new_name) + + tensor_attr = TensorDistributedAttribute(target_tensor, + dist_context) + process_mesh = dist_context.get_op_distributed_attr_for_program( + matched_op).get_process_mesh() + dims_mapping = dist_context.get_op_distributed_attr_for_program( + matched_op).get_input_dims_mapping(var_name) + tensor_attr.set_dims_mapping(dims_mapping) + tensor_attr.set_process_mesh(process_mesh) + dist_context.set_tensor_distributed_attr_for_program(target_tensor, + tensor_attr) + + # rename op input name according to new name + for op in block.ops: + for name in op.input_arg_names: + op_dist_attr = dist_context.get_op_distributed_attr_for_program( + op) + if name == var_name and op_dist_attr is not None: + op_process_mesh = op_dist_attr.get_process_mesh() + op_input_dims_mapping = op_dist_attr.get_input_dims_mapping( + var_name) + if op_process_mesh._id == process_mesh._id and op_input_dims_mapping == dims_mapping: + op.desc._rename_input(name, target_tensor.name) + op_dist_attr.set_input_dims_mapping( + target_tensor.name, dims_mapping) + op_dist_attr._dims_mapping.pop(name, None) + + +def _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): + """Remove no need ops in the main program""" + not_remove_op_ref = [ + "create_py_reader", "create_double_buffer_reader", "read" + ] + remove_op_idx = [] + block = auto_parallel_main_prog.global_block() + ops = block.ops + vars = block.vars + for idx, op in enumerate(ops): + # handle read op in the pipeline scene specially, it will be removed in the future. 
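+        # For a read op, gather the shapes of its outputs and write them back
+        # onto the matching create_py_reader's shape_concat attribute, so the
+        # reader description stays consistent with the pruned program.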
+ if op.type == "read": + dim_list = [] + for var_name in op.output_arg_names: + dim_list.extend(vars[var_name].shape) + for i in range(idx, -1, -1): + if ops[i].type == "create_py_reader": + ops[i]._set_attr("shape_concat", dim_list) + break + continue + + # replace the input and output of c_sync_comm_stream op when in pipeline scene. + if op.type == "c_sync_comm_stream": + need_save = [] + for var_name in op.input_arg_names: + process_mesh = dist_context.get_tensor_distributed_attr_for_program( + vars[var_name]).get_process_mesh() + if rank_id in process_mesh.process_group: + need_save.append(var_name) + if not need_save: + remove_op_idx.append(idx) + continue + + proto = OpProtoHolder.instance().get_op_proto(op.type) + op.desc.set_input(proto.inputs[0].name, need_save) + op.desc.set_output(proto.outputs[0].name, need_save) + continue + + # judge the other op whether should be removed. + op_dist_attr = dist_context.get_op_distributed_attr_for_program(op) + if op_dist_attr is not None: + op_process_mesh = op_dist_attr.get_process_mesh() + if rank_id not in op_process_mesh.process_group and op.type not in not_remove_op_ref: + remove_op_idx.append(idx) + + for idx in remove_op_idx[::-1]: + block._remove_op(idx) + + +def _remove_no_need_vars(auto_parallel_main_prog): + """Remove no need vars in the main program""" + remove_vars = set() + block = auto_parallel_main_prog.global_block() + ops = block.ops + vars = block.vars + need_vars = set() + for op in ops: + for var_name in op.input_arg_names: + if var_name in vars: + need_vars.add(var_name) + for var_name in op.output_arg_names: + if var_name in vars: + need_vars.add(var_name) + for var in vars: + if var not in need_vars: + remove_vars.add(var) + for var in remove_vars: + block._remove_var(var) + + +def remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id): + """Remove no need vars and ops in the main program.""" + _remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id) + _remove_no_need_vars(auto_parallel_main_prog) + + +def remove_no_need_in_startup(auto_parallel_main_prog, + auto_parallel_startup_prog): + """Remove no need vars and ops in the startup program.""" + main_input_vars = set() + main_ops = auto_parallel_main_prog.global_block().ops + for op in main_ops: + for var_name in op.input_arg_names: + main_input_vars.add(var_name) + + startup_block = auto_parallel_startup_prog.global_block() + startup_output_vars = set() + startup_ops = startup_block.ops + for op in startup_ops: + # skip c_sync_comm_stream op + if op.type == "c_sync_comm_stream": + continue + for var_name in op.output_arg_names: + startup_output_vars.add(var_name) + + need_vars = set() + for var_name in startup_output_vars: + if var_name in main_input_vars: + need_vars.add(var_name) + + startup_ops = startup_block.ops + actual_need_vars = set() + for idx, op in enumerate(startup_ops): + is_need_op = False + if op.type == "c_sync_comm_stream": + continue + for var_name in op.output_arg_names: + if var_name in need_vars: + is_need_op = True + break + if is_need_op: + for var_name in op.output_arg_names: + actual_need_vars.add(var_name) + for var_name in op.input_arg_names: + actual_need_vars.add(var_name) + + remove_vars = set() + for var_name in startup_block.vars: + if var_name not in actual_need_vars: + remove_vars.add(var_name) + for var in remove_vars: + startup_block._remove_var(var) + + remove_op_idx = [] + vars = startup_block.vars + for idx, op in enumerate(startup_block.ops): + is_no_need_op = False + if op.type == 
"c_sync_comm_stream": + var_names = [] + for var_name in op.input_arg_names: + if var_name in vars: + var_names.append(var_name) + if not var_names: + remove_op_idx.append(idx) + else: + proto = OpProtoHolder.instance().get_op_proto(op.type) + op.desc.set_input(proto.inputs[0].name, var_names) + op.desc.set_output(proto.outputs[0].name, var_names) + continue + + for var_name in op.output_arg_names: + if var_name not in vars: + is_no_need_op = True + break + if is_no_need_op: + remove_op_idx.append(idx) + for idx in remove_op_idx[::-1]: + startup_block._remove_op(idx) + + +def reshard(auto_parallel_main_prog, auto_parallel_startup_prog, rank_id, + dist_context): + """ + Reshard tensor in the program according to its dist attr and corresponding op dist attr. + + Args: + auto_parallel_main_prog (Program): An auto parallel main program. + auto_parallel_startup_prog (Program): An auto parallel startup program. + rank_id (int): The process id. + """ + assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_main_prog should be Program, " \ + "but got {}.".format(type(auto_parallel_main_prog)) + assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_startup_prog should be Program, " \ + "but got {}.".format(type(auto_parallel_startup_prog)) + assert isinstance(rank_id, int), "The type of rank_id should be int, " \ + "but got {}.".format(type(rank_id)) + assert isinstance(dist_context, DistributedContext), "The type of dist_context should be DistributedContext, " \ + "but got {}.".format(type(dist_context)) + + block = auto_parallel_main_prog.global_block() + idx = 0 + while idx < len(block.ops): + pre_op_count = len(block.ops) + op = block.ops[idx] + op_dist_attr = dist_context.get_op_distributed_attr_for_program(op) + if op_dist_attr is not None: + idx_offset = 0 + for var_name in op.input_arg_names: + # skip lod_tensor_blocking_queue_0 + if var_name == "lod_tensor_blocking_queue_0": + continue + var = block.vars[var_name] + tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_program( + var) + if tensor_dist_attr is not None and _need_reshard( + tensor_dist_attr, op_dist_attr): + reshard_op_desc = find_op_desc_seq(var, tensor_dist_attr, + op_dist_attr) + parse_op_desc(auto_parallel_main_prog, rank_id, + reshard_op_desc, var_name, op, dist_context) + cur_op_count = len(block.ops) + idx_offset = idx_offset + cur_op_count - pre_op_count + pre_op_count = cur_op_count + idx = idx + idx_offset + 1 + else: + idx += 1 + + # remove no need vars and ops in the main program + remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id) + + # remove no need vars and ops in the startip program + remove_no_need_in_startup(auto_parallel_main_prog, + auto_parallel_startup_prog) diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py index f0f26bd2e0d06..28260d7aa1863 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/__init__.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and from .hybrid_parallel_optimizer import HybridParallelOptimizer from .hybrid_parallel_gradscaler import HybridParallelGradScaler +from .dygraph_sharding_optimizer import DygraphShardingOptimizer __all__ = [] diff --git 
a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py index 581fbc5153ad4..6cd875905864b 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py @@ -50,7 +50,9 @@ def __init__(self, clip, hcg): @imperative_base.no_grad def _dygraph_clip(self, params_grads): params_and_grads = [] - sum_square_list = [] + sum_square_list_dist = [] + sum_square_list_not_dist = [] + for p, g in params_grads: if g is None: continue @@ -62,18 +64,49 @@ def _dygraph_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows(merge_grad) square = layers.square(merge_grad) sum_square = layers.reduce_sum(square) - sum_square_list.append(sum_square) - - # all parameters have been filterd out - if len(sum_square_list) == 0: - return params_grads - global_norm_var = layers.concat(sum_square_list) - global_norm_var = layers.reduce_sum(global_norm_var) - # add all reduce to get global norm in world size - paddle.distributed.all_reduce(global_norm_var, - self._hcg.get_check_parallel_group()) - global_norm_var = layers.sqrt(global_norm_var) + not_shared_enable = (not hasattr(p, 'is_firstly_shared')) or ( + hasattr(p, 'is_firstly_shared') and + getattr(p, 'is_firstly_shared', True)) + + if not_shared_enable: + if p.is_distributed: + sum_square_list_dist.append(sum_square) + else: + sum_square_list_not_dist.append(sum_square) + + global_norm_var_dist = layers.concat(sum_square_list_dist) if len( + sum_square_list_dist) != 0 else layers.concat( + [paddle.to_tensor([0.])]) + global_norm_var_dist = layers.reduce_sum(global_norm_var_dist) + + global_norm_var_not_dist = layers.concat( + sum_square_list_not_dist) if len( + sum_square_list_not_dist) != 0 else layers.concat( + [paddle.to_tensor([0.])]) + global_norm_var_not_dist = layers.reduce_sum(global_norm_var_not_dist) + + # add all reduce to get global norm of distributed params_and_grads + if self._hcg.get_model_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_dist, + group=self._hcg.get_check_parallel_group()) + + # add all reduce to get global norm of non-distributed params_and_grads in groups of pp + if self._hcg.get_pipe_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_not_dist, + group=self._hcg.get_pipe_parallel_group()) + + # In Sharding mode, param and grad is mapping different rank in optimizer. 
+ # ClipGradByGlobalNorm need allreduce to get globol norm + if self._hcg.get_sharding_parallel_world_size() > 1: + paddle.distributed.all_reduce( + global_norm_var_not_dist, + group=self._hcg.get_sharding_parallel_group()) + + global_norm_var = layers.sqrt(global_norm_var_dist + + global_norm_var_not_dist) max_global_norm = layers.fill_constant( shape=[1], dtype=global_norm_var.dtype, value=self.clip_norm) @@ -96,7 +129,7 @@ def __getattr__(self, item): return getattr(self._clip, item) def __call__(self, params_grads): - return self._clip(params_grads) + return self._dygraph_clip(params_grads) class HybridParallelOptimizer: @@ -112,7 +145,7 @@ def __init__(self, optimizer, hcg, strategy): self._need_dp = (self._hcg.get_data_parallel_world_size() > 1) # NOTE(shenliang03): Because of the pure DataParallel mode, the gradient synchronization - # is achieved through reducer, so there is no need to call fuse_allreduce in oprimizer. + # is achieved through reducer, so there is no need to call fuse_allreduce in optimizer. self._dp_enable = not self._use_dp_mode and self._need_dp self._sharding_enable = ( @@ -120,11 +153,16 @@ def __init__(self, optimizer, hcg, strategy): if isinstance(self._inner_opt._grad_clip, ClipGradByGlobalNorm) and not self._use_dp_mode: - logger.warning("using ClipGradByGlobalNorm in TensorParallel, the origin " \ - "optmizer'grad clip will be changed.") - - self._inner_opt._grad_clip = HybridParallelClipGrad( - self._inner_opt._grad_clip, hcg) + logger.warning("While using ClipGradByGlobalNorm in TensorParallel, PipelineParallel " \ + "or Sharding, the grad clip of original optimizer will be changed.") + + if self._sharding_enable: + # change sharding inner_optimizer's _grad_clip + self._inner_opt._inner_optimizer._grad_clip = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) + else: + self._inner_opt._grad_clip = HybridParallelClipGrad( + self._inner_opt._grad_clip, hcg) @imperative_base.no_grad @framework.dygraph_only diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 7d899cff41871..c8eaa54f9cda1 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -460,6 +460,8 @@ def __get_ouputs_name_to_idx(self, first_backward_idx, block): if is_optimizer_op(op): break for name in op.output_arg_names: + if name == core.kEmptyVarName(): + continue var = block.var(name) if not outputs_name_to_idx.get(var): # if the grad only be generated by one op diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py index 3816e9b3051ab..bb6af1b3195f7 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/offload_helper.py @@ -12,8 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +import copy from ..common import is_optimizer_op, OP_ROLE_KEY, OpRole, is_update_op from paddle.fluid import core, unique_name +from .shard import Shard __all__ = [] @@ -23,11 +25,9 @@ class OffloadHelper(object): cuda_place_type = 1 cuda_pinned_place_type = 2 - def __init__(self): - pass - "0: dst is on CPUPlace. " - "1: dst is on CUDAPlace. " - "2: dst is on CUDAPinnedPlace. 
" + def __init__(self, mp_ring_id=None, dp_ring_id=None): + self.mp_ring_id = mp_ring_id + self.dp_ring_id = dp_ring_id def _insert_cast_op(self, block, idx, src_name, dst_name): src_var = block.var(src_name) @@ -50,6 +50,32 @@ def _insert_cast_op(self, block, idx, src_name, dst_name): OP_ROLE_KEY: OpRole.Optimize }) + def _insert_broadcast_op(self, block, idx, param_name): + rings = [] + + if self.dp_ring_id is not None: + rings.append(self.dp_ring_id) + + # need sync non distributed param in mp group + if self.mp_ring_id is not None: + param = block.var(param_name) + if not hasattr(param, 'is_distributed') or not param.is_distributed: + rings.append(self.mp_ring_id) + + # the insert op order is: mp, dp + for ring in rings: + block._insert_op_without_sync( + idx, + type="c_broadcast", + inputs={'X': param_name}, + outputs={'Out': param_name}, + attrs={ + 'ring_id': ring, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward, + }) + def _insert_memcpy_op(self, block, idx, src_name, dst_name, dst_place_type): src_var = block.var(src_name) dst_var = block.var(dst_name) @@ -206,6 +232,8 @@ def remove_param(input_name): # step5: startup_block add offload visited_vars = set() + # FIXME(wangxi): should insert in idx, need move comm init to the head. + insert_idx = len(startup_block.ops) for idx, op in reversed(list(enumerate(startup_block.ops))): for out_name in op.output_arg_names: if out_name in visited_vars: @@ -213,13 +241,16 @@ def remove_param(input_name): if out_name in param_name_to_offload_name: var_name = out_name - # FIXME(wangxi): offload should insert after broadcast param if offload: offload_var_name = param_name_to_offload_name[var_name] - self._insert_offload_op(startup_block, idx + 1, + self._insert_offload_op(startup_block, insert_idx, var_name, offload_var_name) - self._insert_cast_op(startup_block, idx + 1, var_name, + self._insert_cast_op(startup_block, insert_idx, var_name, param_to_fp16[var_name]) + # NOTE(wangxi): cast and offload should insert after broadcast param. 
+ # the insert op order is: {mp, dp}broadcast, cast, offload + self._insert_broadcast_op(startup_block, insert_idx, + var_name) visited_vars.add(out_name) @@ -303,3 +334,183 @@ def offload(self, block, startup_block): block._sync_with_cpp() startup_block._sync_with_cpp() + + def opt_sharding_cast_fp32param(self, + block, + startup_block, + params, + offload=False): + """ + (p_fp16) = cast(p) + (p_fp16_recompute) = cast(p) + (pout,) = adam(p) + ===========================> + rename(p_fp16_recompute, p_fp16) + + (pout,) = adam(p) + (p_fp16) = cast(p) + broadcast(p_fp16) + """ + global_params = set() + local_params = set() + param_to_fp16 = dict() + # recompute_var which need rename to fp16_param + fp16_param_to_recompute = dict() + recompute_to_fp16 = dict() + + def remove_param(input_name): + global_params.remove(input_name) + if input_name in local_params: + local_params.remove(input_name) + if input_name in param_to_fp16: + fp16_param = param_to_fp16.pop(input_name) + if fp16_param in fp16_param_to_recompute: + recompute = fp16_param_to_recompute.pop(fp16_param) + recompute_to_fp16.pop(recompute) + + # step1: record param + global_params = set(params) + for idx, op in reversed(list(enumerate(block.ops))): + if is_update_op(op): + param = op.desc.input("Param")[0] + local_params.add(param) + + # step2: remove param which can't offload and + # record param->fp16param, fp16param->recompute_var + for idx, op in enumerate(block.ops): + if is_optimizer_op(op): + break + # TODO (Yuang Liu): tmp solution for fuse_grad_merge + optimize_cast + if op.type == 'coalesce_tensor': + continue + for input_name in op.desc.input_arg_names(): + if input_name not in global_params: + continue + + # param which will be used by fp32 op + if op.type != 'cast': + remove_param(input_name) + continue + + # param is only used by cast op, + # which to cast fp32_param to fp16_param + output_name = op.output_arg_names[0] + if 'cast_fp16' not in output_name: + remove_param(input_name) + continue + + if 'subprog' not in output_name: + assert output_name == input_name + '.cast_fp16' + assert input_name not in param_to_fp16, \ + "There must be only one cast op from fp32 param to fp16 param." 
+ param_to_fp16[input_name] = output_name + else: + # fp16-->recompute_var + assert input_name in param_to_fp16, \ + "param must first be cast to fp16" + fp16_param = param_to_fp16[input_name] + fp16_param_to_recompute[fp16_param] = output_name + recompute_to_fp16[output_name] = fp16_param + + param_name_to_offload_name = dict() + # step3: main_block add offload, cast op + # change recompute to fp16, remove cast(param) to fp16 + for idx, op in reversed(list(enumerate(block.ops))): + if is_update_op(op): + param = op.desc.input("Param")[0] + if param not in global_params: + continue + # step3.1: create offload_var + offload_var_name = self._get_offload_var_name(param) + param_name_to_offload_name[param] = offload_var_name + if offload: + self._create_offload_var(param, offload_var_name, + [block, startup_block]) + + # step3.2: insert cast op and offload op + self._insert_offload_op(block, idx + 1, param, + offload_var_name) + + assert param in param_to_fp16 + fp16_param_name = param_to_fp16[param] + fp16_param_var = block.var(fp16_param_name) + fp16_param_var.persistable = True + self._insert_cast_op(block, idx + 1, param, + param_to_fp16[param]) + + if offload: + # step3.3: insert fetch op + self._insert_fetch_op(block, idx, offload_var_name, param) + + continue + + # step3.4: remove cast op + if op.type == 'cast': + input_name = op.desc.input_arg_names()[0] + if input_name in global_params: + block._remove_op(idx, sync=False) + continue + + # step3.5: change recompute_param to fp16_param + for input_name in op.desc.input_arg_names(): + if input_name in recompute_to_fp16: + op._rename_input(input_name, recompute_to_fp16[input_name]) + for output_name in op.desc.output_arg_names(): + if output_name in recompute_to_fp16: + op._rename_output(output_name, + recompute_to_fp16[output_name]) + + # step4: remove recompute_param + for name in recompute_to_fp16.keys(): + block._remove_var(name, sync=False) + + # step5: remove fp32 param which not need + for idx, op in enumerate(block.ops): + if op.type not in ['coalesce_tensor', 'c_broadcast']: + continue + for input_name in op.desc.input_arg_names(): + if input_name in param_to_fp16: + op._rename_input(input_name, param_to_fp16[input_name]) + for output_name in op.desc.output_arg_names(): + if output_name in param_to_fp16: + op._rename_output(output_name, param_to_fp16[output_name]) + + for param in global_params: + assert param in param_to_fp16 + fp16_param_name = param_to_fp16[param] + fp16_param_var = block.var(fp16_param_name) + fp16_param_var.persistable = True + + if param not in local_params: + block._remove_var(param, sync=False) + + # step6: startup_block add offload + visited_vars = set() + insert_idx = len(startup_block.ops) + for idx, op in reversed(list(enumerate(startup_block.ops))): + for out_name in op.output_arg_names: + if out_name in visited_vars: continue + + if out_name in param_to_fp16: + var_name = out_name + if offload: + self._insert_offload_op( + startup_block, idx + 1, var_name, + param_name_to_offload_name[var_name]) + + self._insert_cast_op(startup_block, insert_idx, var_name, + param_to_fp16[var_name]) + + # NOTE(wangxi): cast and offload should insert after broadcast param. 
+ # the insert op order is: {mp, dp}broadcast, cast, offload + self._insert_broadcast_op(startup_block, insert_idx, + var_name) + + if var_name not in local_params: + param = startup_block.var(out_name) + param.persistable = False + + visited_vars.add(out_name) + + block._sync_with_cpp() + startup_block._sync_with_cpp() diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py index 0b8f67a0a7cd9..447b52ace6978 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py @@ -14,7 +14,7 @@ import paddle from paddle.fluid import core, unique_name from functools import reduce -from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op +from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op, is_backward_op, is_optimizer_op from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY import re @@ -366,6 +366,24 @@ def insert_allreduce_ops(block, class FuseHelper(object): + @staticmethod + def sort_vars_by_dtype(block, vars_name): + fp32_vars = [] + fp16_vars = [] + other_vars = [] + for var in vars_name: + dtype = block.var(var).dtype + if dtype == paddle.float32: + fp32_vars.append(var) + elif dtype == paddle.float16: + fp16_vars.append(var) + else: + other_vars.append(var) + assert len(other_vars) == 0, "only support fp32/fp16 vars for fuse" + + fp32_vars.extend(fp16_vars) + return fp32_vars + @staticmethod def get_fused_groups(block, vars_name, fuse_size=32.): """ coalesce tensor, get fused group """ @@ -639,6 +657,54 @@ def insert_broadcast_param_ops(block, return param_in_this_device +def fuse_opt_broadcast_param_ops(block, + ring_id, + shard, + op_role=OpRole.Optimize, + strategy=None): + """ + fuse optimizer sharding broadcast param ops + """ + if strategy is None or not strategy.fuse_all_reduce_ops: + return + + fuse_size = strategy.fuse_grad_size_in_MB + + nranks = shard.worker_num + device_to_vars = [[] for _ in range(nranks)] + + for idx, op in reversed(list(enumerate(block.ops))): + if not is_optimizer_op(op) or op.type != 'c_broadcast': + break + var = op.input_arg_names[0] + root_id = op.attr('root') + device_to_vars[root_id].insert(0, var) + block._remove_op(idx, sync=False) + + insert_idx = idx + 1 + for root_id, vars_name in enumerate(device_to_vars): + vars_name = FuseHelper.sort_vars_by_dtype(block, vars_name) + groups = FuseHelper.get_fused_groups(block, vars_name, fuse_size) + + fused_vars, insert_num = FuseHelper.insert_coalesce_tensor( + block, insert_idx, groups, op_role, prefix="Param") + + for fused_var in fused_vars: + block._insert_op_without_sync( + insert_idx + insert_num, + type='c_broadcast', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={ + 'ring_id': ring_id, + 'root': root_id, + 'use_calc_stream': True, + OP_ROLE_KEY: op_role + }) + + block._sync_with_cpp() + + def get_grad_device(grad_name, shard): assert "@GRAD" in grad_name, "[{}] should be a grad variable.".format( grad_name) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index 1af646b3959e0..18211459a4e08 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -329,6 +329,7 @@ def _insert_allreduce_for_pp(self, 
params_grads): if self.pp_degree == 1: return strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() @@ -399,6 +400,8 @@ def _insert_allreduce_for_pp(self, params_grads): first_optimize_op_index += (len(main_block.ops) - len_of_ops) len_of_ops = len(main_block.ops) + # NOTE(wangxi): we fused after optimize_cast + optimize_cast = sharding_configs['optimize_cast'] optimizer_param = utils.insert_broadcast_param_ops( main_block, len_of_ops, @@ -407,10 +410,10 @@ def _insert_allreduce_for_pp(self, params_grads): OpRole.Optimize, use_calc_stream=True, rank=self.dp_rank, - strategy=strategy) + strategy=None if optimize_cast else strategy) logger.info("Optimizer param in this rank {}".format( optimizer_param)) - if not strategy.fuse_grad_merge: + if not strategy.fuse_grad_merge and not optimize_cast: assert len(accumulated_grad_names) == len(optimizer_param) elif self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": insert_allreduce_ops( @@ -458,18 +461,22 @@ def _insert_loss_grad_scale_op(self): main_block._sync_with_cpp() - def _apply_optimize_offload_pass(self): + def _apply_optimize_offload_pass(self, params_grads): strategy = self.user_defined_strategy sharding_configs = strategy.sharding_configs main_block = self._main_program.global_block() startup_block = self._startup_program.global_block() + mp_ring_id = self.mp_ring_id if self.mp_degree > 1 else None + dp_ring_id = self.dp_ring_id if self.dp_degree > 1 else None + offload_helper = OffloadHelper( + mp_ring_id=mp_ring_id, dp_ring_id=dp_ring_id) + # optimize offload should be enable while gradient merge is enable and # acc_step is quite large (e.g. >> 100). Since its memcpy could not be # overlap with calc, otherwise it will slower down training severely. if sharding_configs["optimize_offload"]: logger.info("Sharding with optimize offload !") - offload_helper = OffloadHelper() offload_helper.offload(main_block, startup_block) # The optimize_cast is already included in offload_fp32param offload_helper.offload_fp32param(main_block, startup_block) @@ -477,8 +484,16 @@ def _apply_optimize_offload_pass(self): logger.info("Sharding with optimize cast !") # NOTE(wangxi): optimize_cast will persist fp16 param, it # will take more memory, but will be faster. Trade space for time. 
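            # Rough, illustrative estimate of that trade-off (editorial sketch, not
            # part of the original change): keeping the fp16 copy persistent means
            # both precisions stay resident, roughly
            #     extra_bytes ~= sum(numel(p) for p in fp32_params) * 2  # fp16 = 2 B/elem
            # i.e. about +50% of the fp32 parameter memory, in exchange for removing
            # the repeated fp32->fp16 casts in the forward/recompute passes.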
- offload_helper = OffloadHelper() - offload_helper.cast_fp32param_in_optimize(main_block, startup_block) + if self._optimizer_sharding: + offload_helper.opt_sharding_cast_fp32param( + main_block, startup_block, + [x[0].name for x in params_grads]) + # NOTE(wangxi): fused after optimize_cast + utils.fuse_opt_broadcast_param_ops( + main_block, dp_ring_id, self._shard, strategy=strategy) + else: + offload_helper.cast_fp32param_in_optimize(main_block, + startup_block) def _dump_program_for_debug(self): main_block = self._main_program.global_block() @@ -525,7 +540,7 @@ def minimize_impl(self, self._insert_loss_grad_scale_op() # apply optimize offload or optimize cast - self._apply_optimize_offload_pass() + self._apply_optimize_offload_pass(params_grads) # step6: (optional) sharding gradient merge self._sharding_gradient_merge() @@ -540,6 +555,10 @@ def minimize_impl(self, # init param broadcast should be called after startup pruning self._initialization_broadcast() + # NOTE(wangxi): if param is not persistable, program.clone will + # failed, so we remove no persistable param, recreate param as a var + self._recreate_not_persist_param_as_var() + self._dump_program_for_debug() # GPU need to wait server ready, GPU and NPU is Layered connection @@ -1371,42 +1390,96 @@ def _build_groups(self): return + def _recreate_not_persist_param_as_var(self): + def recreate_not_persist_param_as_var(program): + block = program.global_block() + params = block.all_parameters() + for param in params: + if param.persistable: + continue + + name = param.name + shape = param.shape + dtype = param.dtype + type = param.type + lod_level = param.lod_level + stop_gradient = param.stop_gradient + trainable = param.trainable + optimize_attr = param.optimize_attr + regularizer = param.regularizer + have_dist_attr = False + is_distributed = False + if hasattr(param, 'is_distributed'): + have_dist_attr = True + is_distributed = param.is_distributed + + block._remove_var(name, sync=False) + var = block.create_var( + name=name, + shape=shape, + dtype=dtype, + type=type, + lod_level=lod_level, + stop_gradient=stop_gradient, + trainable=trainable, + persistable=False) + if have_dist_attr: + var.is_distributed = is_distributed + + block._sync_with_cpp() + + recreate_not_persist_param_as_var(self._startup_program) + recreate_not_persist_param_as_var(self._main_program) + def _initialization_broadcast(self): """ - this funtion is to ensure the initialization between dp group to be - identical when hybrid-dp is used. + this funtion is to ensure the initialization between dp group to be + identical when hybrid-dp is used, and the initialization of + not distributed param between mp group to be identical. 
""" - if not self.hybrid_dp: + if self.dp_degree <= 1 and self.mp_degree <= 1: return startup_block = self._startup_program.global_block() + params = startup_block.all_parameters() + params_name = [] + not_dist_param_name = set() - broadcast_params = [] for param in params: - broadcast_params.append(param) - # optimize_cast need broadcast fp16 param - fp16_param_name = param.name + '.cast_fp16' - if startup_block.has_var(fp16_param_name): - fp16_param = startup_block.var(fp16_param_name) - broadcast_params.append(fp16_param) - - for param in broadcast_params: - startup_block.append_op( - type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': self.dp_ring_id, - 'root': 0, - OP_ROLE_KEY: OpRole.Forward - }) - startup_block.append_op( - type='c_sync_comm_stream', - inputs={'X': broadcast_params}, - outputs={'Out': broadcast_params}, - attrs={'ring_id': self.dp_ring_id, - OP_ROLE_KEY: OpRole.Forward}) + params_name.append(param.name) + if not hasattr(param, 'is_distributed') or not param.is_distributed: + not_dist_param_name.add(param.name) + + # offload and optimize_cast will insert broadcast op + broadcast_params = set() + for op in startup_block.ops: + if op.type == 'c_broadcast': + broadcast_params.add(op.desc.output_arg_names()[0]) + + for param in params_name: + if param in broadcast_params: continue + + rings = [] + # need sync not distributed param in mp group + if self.mp_degree > 1 and param in not_dist_param_name: + rings.append(self.mp_ring_id) + if self.dp_degree > 1: + rings.append(self.dp_ring_id) + + for ring in rings: + startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': ring, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward + }) + + startup_block._sync_with_cpp() # sharding gradient merge def create_persistable_gradients_and_insert_merge_ops( diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index 2555d73462b78..2ce8cf7bdeb74 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -70,7 +70,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False def forward(self, x): if self.is_mp: @@ -135,7 +135,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False if has_bias: # initialize bias to zero like Megatron @@ -144,7 +144,7 @@ def __init__(self, attr=paddle.nn.initializer.Constant(value=0.0), dtype=self._dtype, is_bias=True) - self.bias.is_distributed = True + self.bias.is_distributed = True if self.is_mp else False else: self.bias = None @@ -212,7 +212,7 @@ def __init__(self, dtype=self._dtype, is_bias=False) - self.weight.is_distributed = True + self.weight.is_distributed = True if self.is_mp else False if has_bias: self.bias = self.create_parameter( diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index db6fc964895ff..9920bbd400c70 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -261,6 +261,10 @@ 
def _synchronize_shared_weights(self): src=min(comm['ranks']), group=comm['group']) + for param in comm['layer'].parameters(): + if self.global_rank != min(comm['ranks']): + setattr(param, 'is_firstly_shared', False) + def allreduce_shared_weight_gradients(self): for key, comm in self.shared_comm.items(): param = getattr(self.shared_layers[key], comm['weight_attr']) @@ -316,6 +320,9 @@ def _build_layer(self): self.shared_layers[layer.layer_name] = layer.build_layer() self.shared_weight_attrs[ layer.layer_name] = layer.shared_weight_attr + for param in self.shared_layers[ + layer.layer_name].parameters(): + setattr(param, "is_firstly_shared", True) if layer.forward_func is None: self.run_function.append(self.shared_layers[ diff --git a/python/paddle/distributed/fleet/utils/fs.py b/python/paddle/distributed/fleet/utils/fs.py index fb518f62a1269..f56580f8ca2fe 100644 --- a/python/paddle/distributed/fleet/utils/fs.py +++ b/python/paddle/distributed/fleet/utils/fs.py @@ -468,10 +468,17 @@ def __init__( self._bd_err_re = re.compile( r'\s?responseErrorMsg\s?\:.*, errorCode\:\s?[0-9]+, path\:') - def _run_cmd(self, cmd, redirect_stderr=False): + def _run_cmd(self, cmd, redirect_stderr=False, retry_times=5): exe_cmd = "{} -{}".format(self._base_cmd, cmd) - ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr) - ret = int(ret) + ret = 0 + output = None + retry_sleep_second = 3 + for x in range(retry_times + 1): + ret, output = core.shell_execute_cmd(exe_cmd, 0, 0, redirect_stderr) + ret = int(ret) + if ret == 0: + break + time.sleep(retry_sleep_second) if ret == 134: raise FSShellCmdAborted(cmd) return ret, output.splitlines() @@ -1106,3 +1113,35 @@ def _split_files(self, files, trainer_id, trainers): begin += blocks[i] return trainer_files[trainer_id] + + def list_files_info(self, path_list): + """ + list_files return file path and size + Args: + path_list(list): file list + Returns: + fileist(list): file list with file path and size + """ + if len(path_list) <= 0: + return [] + + file_list = [] + + #concat filelist can speed up 'hadoop ls' + str_concat = "" + for path in path_list: + str_concat += path + " " + cmd = "ls " + str_concat + " | awk '{if ($8 != \"\") {print $5\" \"$8 }}'" + ret, lines = self._run_cmd(cmd) + if (len(lines) == 0): + logger.warning("list_files empty, path[%s]" % path_list) + return [] + for line in lines: + arr = line.split(' ') + if len(arr) < 2: + continue + file_path = arr[1] + file_size = int(arr[0]) + file_list.append({'path': file_path, 'size': file_size}) + + return file_list diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils.py index 6d14b30d18c7f..63585e167e8e3 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils.py @@ -489,9 +489,6 @@ def __ne__(self, pod): def parse_response(self, res_pods): pass - def rank(self): - return self.rank - def get_visible_gpus(self): r = "" for g in self.gpus: diff --git a/python/paddle/fft.py b/python/paddle/fft.py new file mode 100644 index 0000000000000..3ac02c9c8dc18 --- /dev/null +++ b/python/paddle/fft.py @@ -0,0 +1,61 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .tensor.fft import fft # noqa: F401 +from .tensor.fft import fft2 # noqa: F401 +from .tensor.fft import fftn # noqa: F401 +from .tensor.fft import ifft # noqa: F401 +from .tensor.fft import ifft2 # noqa: F401 +from .tensor.fft import ifftn # noqa: F401 +from .tensor.fft import rfft # noqa: F401 +from .tensor.fft import rfft2 # noqa: F401 +from .tensor.fft import rfftn # noqa: F401 +from .tensor.fft import irfft # noqa: F401 +from .tensor.fft import irfft2 # noqa: F401 +from .tensor.fft import irfftn # noqa: F401 +from .tensor.fft import hfft # noqa: F401 +from .tensor.fft import hfft2 # noqa: F401 +from .tensor.fft import hfftn # noqa: F401 +from .tensor.fft import ihfft # noqa: F401 +from .tensor.fft import ihfft2 # noqa: F401 +from .tensor.fft import ihfftn # noqa: F401 +from .tensor.fft import fftfreq # noqa: F401 +from .tensor.fft import rfftfreq # noqa: F401 +from .tensor.fft import fftshift # noqa: F401 +from .tensor.fft import ifftshift # noqa: F401 + +__all__ = [ # noqa + 'fft', + 'fft2', + 'fftn', + 'ifft', + 'ifft2', + 'ifftn', + 'rfft', + 'rfft2', + 'rfftn', + 'irfft', + 'irfft2', + 'irfftn', + 'hfft', + 'hfft2', + 'hfftn', + 'ihfft', + 'ihfft2', + 'ihfftn', + 'fftfreq', + 'rfftfreq', + 'fftshift', + 'ifftshift' +] diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 8bf27f6d2fd98..7aa3c888f2ad1 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -197,13 +197,18 @@ def modify_forward_desc_for_recompute(self): if op.desc.has_attr(op_device_attr_name): op_device = op.desc.attr(op_device_attr_name) + # Setting the force_cpu of seed to true will make the output of seed in cpu memory, + # reduce the synchronous copy from GPU to CPU in dropout, and reduce the communication hang added_op = self.block._insert_op( index=op.idx, type='seed', inputs={}, outputs={'Out': [added_var]}, - attrs={'seed': seed, - 'op_device': op_device}) + attrs={ + 'seed': seed, + 'op_device': op_device, + 'force_cpu': True + }) self.ops.insert(op_idx, added_op) # modify dropout op desc so that it accept a seed var as input op.desc.set_input("Seed", [var_unique_name]) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 5a9ea1a445e2d..4cca41b527bc2 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -28,6 +28,7 @@ from .data_feeder import check_variable_and_dtype from .framework import in_dygraph_mode from .layer_helper import LayerHelper +from .framework import default_main_program __all__ = [ 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', @@ -547,7 +548,12 @@ def _static_clip(self, params_grads): scale_input = (scale_var.astype('float16') if g.dtype == core.VarDesc.VarType.FP16 else scale_var) - p.block.append_op( + # NOTE(Yuang Liu): For pure dp with gradient merge, the p and g + # will be in different blocks with the gradient clip related ops. + # We need to handle the correct block, otherwise will encounter + # a 'NotFoundError' during compile time. 
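            # That is: p.block is the block where the parameter is defined, which under
            # gradient merge may differ from the block currently being built, so the
            # scaling op is appended to the current block of the default main program
            # to keep it alongside the other clip-related ops.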
+ block = default_main_program().current_block() + block.append_op( type='elementwise_mul', inputs={'X': g, 'Y': scale_input}, diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index bb030cbac1bea..a72ea4d9b8510 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -518,9 +518,13 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): t = t.reshape(shape[0], shape[1]) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3], + shape[2]) + mask = func(t, n=n, m=m) + return mask.reshape([shape[0], shape[1], shape[3], + shape[2]]).transpose([0, 1, 3, 2]).astype(dtype) else: raise ValueError("The dimension of input tensor is not supported in create_mask, " \ "Only dimension < 4 is supported but got {}".format(len(shape))) @@ -572,9 +576,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): t = t.reshape(shape[0], shape[1]) elif len(shape) == 3: t = t.reshape(shape[0] * shape[1], shape[2]) - # 4d-tensor conv (out, in, h, w) -> (out, in*h*w) in GemmConvKernel Op + # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.reshape(shape[0], shape[1] * shape[2] * shape[3]) + t = t.transpose([0, 1, 3, 2]).reshape( + [shape[0] * shape[1] * shape[3], shape[2]]) else: raise ValueError("The dimension of input tensor is not supported in create_mask, " \ "Only dimension < 4 is supported but got {}".format(len(shape))) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 438831208b66a..d683e36fbe5ab 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -396,6 +396,8 @@ def set_feed_type(self, data_feed_type): Set data_feed_desc """ self.proto_desc.name = data_feed_type + if (self.proto_desc.name == "SlotRecordInMemoryDataFeed"): + self.dataset = core.Dataset("SlotRecordDataset") @deprecated( since="2.0.0", diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index c8e1370e44772..18052fa7d4da8 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -414,7 +414,7 @@ def grad(outputs, no_grad_vars=None): ''' .. note:: - **This API is ONLY available in Dygraph mode.** + **This API is ONLY available in imperative mode.** This API computes the sum of gradients of `outputs` with respect to each `inputs` . diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 4c7537d8d5c8e..bea5b29ecafa6 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -23,7 +23,8 @@ from .wrapped_decorator import signature_safe_contextmanager import six from .data_feeder import convert_dtype -from .framework import Program, default_main_program, Variable, Operator, convert_np_dtype_to_dtype_ +from .framework import Program, default_main_program, Variable, Operator +from .framework import convert_np_dtype_to_dtype_, get_flags from . import core from . import unique_name from . 
import compiler @@ -1016,7 +1017,16 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, check_feed_shape_type(var, feed_tensor, exe.device_count()) feed_tensor_dict[feed_name] = feed_tensor - exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) + #TODO(zhhsplendid): handle other feed data format case for CINN + use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] + if use_cinn: + fetch_var_names = list(map(_to_name_str, fetch_list)) + fetch_tensors = exe.run_from_cinn( + feed_tensor_dict, fetch_var_names)._move_to_list() + return as_numpy( + fetch_tensors) if return_numpy else fetch_tensors + else: + exe.feed_and_split_tensor_into_local_scopes(feed_tensor_dict) elif isinstance(feed, list) or isinstance(feed, tuple): res = list() for i, each in enumerate(feed): @@ -1036,6 +1046,8 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, check_feed_shape_type(var, tensor) res_dict[feed_name] = tensor res.append(res_dict) + + use_cinn = get_flags("FLAGS_use_cinn")["FLAGS_use_cinn"] exe.feed_tensors_into_local_scopes(res) if hasattr(program._program, 'lr_sheduler'): @@ -1044,9 +1056,15 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, lr_value = lr_sheduler() lr_var = program._program.global_block().vars[lr_sheduler._var_name] lr_tensor = _as_lodtensor(lr_value, core.CPUPlace(), lr_var.dtype) - exe.feed_and_split_tensor_into_local_scopes({ - lr_sheduler._var_name: lr_tensor - }) + if core.is_cuda_graph_capturing(): + warnings.warn( + "Caution!!! When capturing CUDA Graph, the learning rate scheduler would not " + "take any effect! Please set the learning rate manually before each batch!" + ) + else: + exe.feed_and_split_tensor_into_local_scopes({ + lr_sheduler._var_name: lr_tensor + }) fetch_var_names = list(map(_to_name_str, fetch_list)) tensors = exe.run(fetch_var_names, return_merged)._move_to_list() diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 11e7e7c2f7c08..4d90b9159470e 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -55,6 +55,7 @@ 'is_compiled_with_cuda', 'is_compiled_with_rocm', 'is_compiled_with_xpu', + 'is_compiled_with_npu', 'Variable', 'require_version', 'device_guard', @@ -380,6 +381,15 @@ def _xpu_ids(): return device_ids +def _npu_ids(): + npus_env = os.getenv("FLAGS_selected_npus") + if npus_env: + device_ids = [int(s) for s in npus_env.split(",")] + else: + device_ids = six.moves.range(core.get_npu_device_count()) + return device_ids + + def is_compiled_with_xpu(): """ Whether this whl package can be used to run the model on XPU. @@ -395,6 +405,21 @@ def is_compiled_with_xpu(): return core.is_compiled_with_xpu() +def is_compiled_with_npu(): + """ + Whether this whl package can be used to run the model on NPU. + + Returns (bool): support npu or not. + + Examples: + .. code-block:: python + + import paddle.fluid as fluid + support_npu = fluid.is_compiled_with_npu() + """ + return core.is_compiled_with_npu() + + def disable_signal_handler(): """ Reset signal handler registered by Paddle. @@ -538,6 +563,47 @@ def xpu_places(device_ids=None): return [core.XPUPlace(dev_id) for dev_id in device_ids] +def npu_places(device_ids=None): + """ + **Note**: + For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. + + This function creates a list of :code:`paddle.NPUPlace` objects. 
+ If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_npus` would be checked first. For example, if + :code:`FLAGS_selected_npus=0,1,2`, the returned list would + be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + If :code:`FLAGS_selected_npus` is not set, all visible + npu places would be returned. + If :code:`device_ids` is not None, it should be the device + ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. + + Parameters: + device_ids (list or tuple of int, optional): list of NPU device ids. + Returns: + list of paddle.NPUPlace: Created NPU place list. + Examples: + .. code-block:: python + + # required: npu + + import paddle + import paddle.static as static + + paddle.enable_static() + npu_places = static.npu_places() + """ + assert core.is_compiled_with_npu(), \ + "Not compiled with NPU" + if device_ids is None: + device_ids = _npu_ids() + elif not isinstance(device_ids, (list, tuple)): + device_ids = [device_ids] + return [core.NPUPlace(dev_id) for dev_id in device_ids] + + def cpu_places(device_count=None): """ This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. @@ -1927,6 +1993,10 @@ def set_value(self, value, scope=None): p = core.Place() p.set_place(t._place()) place = core.XPUPlace(p.xpu_device_id()) + elif p.is_npu_place(): + p = core.Place() + p.set_place(t._place()) + place = core.NPUPlace(p.npu_device_id()) else: p = core.Place() p.set_place(t._place()) @@ -3956,6 +4026,23 @@ def all_op_nodes(self): """ return {IrOpNode(node) for node in self.graph.nodes() if node.is_op()} + def all_sub_graphs(self, for_test=False): + """ + Return all sub_graphs included in the main graph as a set. + """ + + return [ + IrGraph( + self.graph.get_sub_graph(i), for_test=for_test) + for i in range(self.graph.sub_graph_size()) + ] + + def get_sub_graph(self, i, for_test=False): + """ + Return i-th sub_graph in the main graph. + """ + return IrGraph(self.graph.get_sub_graph(i), for_test=for_test) + def create_persistable_node(self, name, var_type, shape, var_dtype): """ Create a persistable variable node in the graph. In IrGraph, @@ -4102,8 +4189,10 @@ def link_to(self, node_in, node_out): node_in(IrNode): the input node. node_out(IrNode): the output node. """ - assert node_in.node in self.graph.nodes() and node_out.node in self.graph.nodes(), \ - 'The two arguments(node_in&node_out) must be in the graph nodes.' + assert node_in.node in self.graph.nodes(), ( + 'node_in(%s) must be in the graph nodes.' % node_in.node.name()) + assert node_out.node in self.graph.nodes(), ( + 'node_out(%s) must be in the graph nodes.' % node_out.node.name()) node_in.append_output(node_out) node_out.append_input(node_in) @@ -4265,7 +4354,8 @@ def _find_node_by_name(self, nodes, node_name): for n in nodes: if n.name() == node_name: target_node = n - assert target_node is not None, "Cannot find the target node in the giving set." + assert target_node is not None, ( + "Cannot find the target node (%s)in the giving set." 
% node_name) return target_node def _update_desc_attr(self, desc, name, val): @@ -5074,11 +5164,7 @@ def _prune_with_input(self, feeded_var_names, targets): else: target_op = op - if target_op is None: - raise ValueError( - "The target variable used for pruning should have an " - "associated operator that generates it.") - else: + if target_op is not None: targets_idx.append([target_op.block.idx, target_op.idx]) else: targets_idx.append([t.block.idx, t.idx]) diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index a5e508d0a0def..77f9ab33c4c34 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -383,7 +383,7 @@ def _worker_num(self): return the current number of worker """ if self._check_role_generation(): - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) return 0 def _server_num(self): @@ -391,30 +391,30 @@ def _server_num(self): return the current number of server """ if self._check_role_generation(): - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) else: self.generate_role() - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) def worker_index(self): """ return the index of worker """ if self._check_role_generation(): - return self._rank / self._proc_per_node + return int(self._rank / self._proc_per_node) else: self.generate_role() - return self._get_size() / 2 + return int(self._get_size() / 2) def server_index(self): """ return the index of server """ if self._check_role_generation(): - return self._rank / self._proc_per_node + return int(self._rank / self._proc_per_node) else: self.generate_role() - return self._get_size() / self._proc_per_node + return int(self._get_size() / self._proc_per_node) def _all_reduce(self, input, output, mode="sum"): """ @@ -612,6 +612,7 @@ def __init__(self, **kwargs): # set running status of http server self._http_server_d["running"] = False self._iface = self.__get_default_iface() + self._iface = "" if self._iface == "lo" else self._iface # this environment variable can be empty self._prefix = os.getenv("SYS_JOB_ID", "") diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py index 39cf3ebeb32a9..78af7fd65dccb 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py @@ -270,6 +270,7 @@ def stop_worker(self): self._role_maker._barrier_worker() if self._role_maker.is_first_worker(): self._fleet_ptr.stop_server() + if self._heter_ptr: self._heter_ptr.stop_xpu_service() self._role_maker._barrier_worker() self._role_maker._barrier_all() @@ -327,6 +328,21 @@ def print_table_stat(self, table_id): self._fleet_ptr.print_table_stat(table_id) self._role_maker._barrier_worker() + def set_file_num_one_shard(self, table_id, file_num): + """ + set file_num in one shard + Args: + table_id(int): the id of table + file_num(int): file num in one shard + Example: + .. 
code-block:: python + fleet.set_file_num_one_shard(0, 5) + """ + self._role_maker._barrier_worker() + if self._role_maker.is_first_worker(): + self._fleet_ptr.set_file_num_one_shard(table_id, file_num) + self._role_maker._barrier_worker() + def save_persistables(self, executor, dirname, main_program=None, **kwargs): """ save presistable parameters, @@ -1075,7 +1091,8 @@ def minimize(self, scopes=None, startup_programs=None, parameter_list=None, - no_grad_set=None): + no_grad_set=None, + program_mode="all_reduce"): """ minimize a program through loss, loss can be a list in DistributedOptimizer. Note that in parameter server mode, a worker will not get anything about optimize_os @@ -1089,6 +1106,7 @@ def minimize(self, in `parameter_list`. parameter_list (list): list of Variables to update. no_grad_set (set|None): set of Variables should be ignored. + program_mode (str|"all_reduce"): grad action for grogram when use_ps_gpu. Returns: tuple: (optimize_ops, params_grads) which are, list of operators appended; and list of (param, grad) Variables pair for optimization. @@ -1123,12 +1141,17 @@ def minimize(self, if opt_info["use_ps_gpu"]: from paddle.fluid.transpiler.collective import MultiThread # check start program - + if program_mode not in [ + "all_reduce", "fuse_all_reduce", "all_gather" + ]: + raise ValueError("You should set program_mode in [ all_reduce, \ + fuse_all_reduce, all_gather ]") env = self.get_dist_env() if not isinstance(losses, list): startup_programs = [startup_programs] for i in range(0, len(startup_programs)): - t = MultiThread() + + t = MultiThread(trans_mode=program_mode) start_program = startup_programs[i] main_program = programs[i] t.transpile( diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py index e2fb29c5439e1..56d476210894e 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py @@ -846,7 +846,7 @@ def _minimize(self, "user_define_dump_filename", "") opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "") opt_info["dump_param"] = strategy.get("dump_param", []) - gpus_env = os.getenv("FLAGS_selected_gpus") + gpus_env = os.getenv("FLAGS_selected_gpus", "0") opt_info["worker_places"] = [int(s) for s in gpus_env.split(",")] opt_info["use_ps_gpu"] = strategy.get("use_ps_gpu", False) if server._server.downpour_server_param.downpour_table_param[ diff --git a/python/paddle/fluid/incubate/fleet/utils/hdfs.py b/python/paddle/fluid/incubate/fleet/utils/hdfs.py index fe09692531ad3..e5b2129e857f4 100644 --- a/python/paddle/fluid/incubate/fleet/utils/hdfs.py +++ b/python/paddle/fluid/incubate/fleet/utils/hdfs.py @@ -25,8 +25,8 @@ import time import logging import six -from . import fs -from .fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted +#from . 
import fs +from paddle.distributed.fleet.utils.fs import FS, LocalFS, FSFileExistsError, FSFileNotExistsError, ExecuteError, FSTimeOut, FSShellCmdAborted from paddle.fluid import core import functools diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index f050b3995be96..e110c47d790f1 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1426,7 +1426,8 @@ def save_inference_model(dirname, main_program.global_block().create_var( name=target_v.name, shape=target_v.shape, - dtype=target_v.dtype) + dtype=target_v.dtype, + persistable=target_v.persistable) prepend_feed_ops(main_program, feeded_var_names) append_fetch_ops(main_program, fetch_var_names) diff --git a/python/paddle/fluid/ir.py b/python/paddle/fluid/ir.py index 17b7ea1122ab7..7e2d3df1ce1e4 100644 --- a/python/paddle/fluid/ir.py +++ b/python/paddle/fluid/ir.py @@ -230,9 +230,6 @@ def __init__(self, type=None): self._type = type def __getattr__(self, name): - if self._type is not None: - raise AttributeError( - "type object 'OpHelper' has no attribute '{}'".format(name)) op = PassDesc.OpHelper(name) op.Init() return op @@ -261,7 +258,12 @@ def Init(self): self._op_idx = len(block.ops) self._op_desc = block.desc.append_op() self._op_desc.set_type(self._type) - self._op_proto = OpProtoHolder.instance().get_op_proto(self._type) + self._op_proto = OpProtoHolder.instance().op_proto_map.get( + self._type) + if self._op_proto is None: + raise AttributeError( + "type object 'OpHelper' has no attribute '{}'".format( + self._type)) block.ops.append(self) def Attr(self, name): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 515d4a5c0ef7c..75b0392ab6ae4 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -10315,6 +10315,8 @@ def unstack(x, axis=0, num=None): if in_dygraph_mode(): if num == None: num = x.shape[axis] + if num == 0: + return [] return _C_ops.unstack(x, num, 'axis', int(axis), 'num', num) helper = LayerHelper('unstack', **locals()) diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index 24076e82b0365..4625d7ea89b25 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -2047,11 +2047,15 @@ def _create_accumulators(self, block, parameters): def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) _lars_weight_decay = self._lars_weight_decay + _lars_coeff = self._lars_coeff param_name = param_and_grad[0].name + is_excluded = False if len(self._exclude_from_weight_decay) > 0: for name in self._exclude_from_weight_decay: if name in param_name: _lars_weight_decay = 0.0 + _lars_coeff = 0.0 + is_excluded = True break velocity_acc = self._get_accumulator(self._velocity_acc_str, @@ -2065,7 +2069,7 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "mu": self._momentum, - "lars_coeff": self._lars_coeff, + "lars_coeff": _lars_coeff, "lars_weight_decay": _lars_weight_decay, "multi_precision": find_master, "rescale_grad": self._rescale_grad @@ -2086,7 +2090,7 @@ def _append_optimize_op(self, block, param_and_grad): # create the momentum optimize op momentum_op = block.append_op( - type=self.type, + type='momentum' if is_excluded else self.type, inputs=inputs, outputs=outputs, attrs=attrs, diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4b887da838257..9d6a1d00cff60 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ 
b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -21,6 +21,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) list(APPEND DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) +list(APPEND DIST_TEST_OPS test_rnn_dp) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) list(APPEND DIST_TEST_OPS test_gen_nccl_id_op) list(APPEND DIST_TEST_OPS test_parallel_dygraph_unused_variables) @@ -66,6 +67,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer_with_recompute) list(APPEND MIXED_DIST_TEST_OPS test_fleet_raw_program_meta_optimizer) +list(APPEND MIXED_DIST_TEST_OPS test_rnn_dp) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer) list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_init) list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer) @@ -84,6 +86,10 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_auto) list(APPEND MIXED_DIST_TEST_OPS test_fleet_static_mp_layers) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner) list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_partitioner_gpt) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_serial) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_mppp) +list(APPEND MIXED_DIST_TEST_OPS test_auto_parallel_reshard_dpmppp) foreach(TEST_OP ${MIXED_DIST_TEST_OPS}) list(REMOVE_ITEM TEST_OPS ${TEST_OP}) endforeach() @@ -223,6 +229,10 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_partitioner_gpt) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_serial) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_mppp) + LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_reshard_dpmppp) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -454,6 +464,11 @@ list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) +# disable sparse_attention which not in suitable env +if ( (NOT WITH_GPU) OR (WIN32) OR (PADDLE_WITH_ARM) OR (WITH_ROCM) ) + list(REMOVE_ITEM TEST_OPS test_sparse_attention_op) +endif() + if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) list(REMOVE_ITEM TEST_OPS test_dataset_dataloader) @@ -587,6 +602,10 @@ if(WITH_DISTRIBUTE) py_test_modules(test_fleet_lamb_meta_optimizer MODULES test_fleet_lamb_meta_optimizer ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner MODULES test_auto_parallel_partitioner ENVS ${dist_ENVS}) py_test_modules(test_auto_parallel_partitioner_gpt MODULES test_auto_parallel_partitioner_gpt ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard MODULES test_auto_parallel_reshard ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_serial MODULES test_auto_parallel_reshard_serial ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_mppp MODULES test_auto_parallel_reshard_mppp ENVS ${dist_ENVS}) + py_test_modules(test_auto_parallel_reshard_dpmppp MODULES 
test_auto_parallel_reshard_dpmppp ENVS ${dist_ENVS}) endif(NOT WIN32) endif(NOT APPLE) if(WITH_DGC) @@ -702,6 +721,7 @@ endif() add_subdirectory(sequence) add_subdirectory(dygraph_to_static) add_subdirectory(rnn) +add_subdirectory(autograd) if (NOT WIN32 OR NOT WITH_GPU) add_subdirectory(fft) @@ -1039,3 +1059,4 @@ if(WITH_GPU OR WITH_ROCM) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) +set_tests_properties(test_tensordot PROPERTIES TIMEOUT 1000) diff --git a/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt new file mode 100644 index 0000000000000..369134c8989a0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/CMakeLists.txt @@ -0,0 +1,10 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +set(GC_ENVS FLAGS_eager_delete_tensor_gb=0.0) + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS ${GC_ENVS}) +endforeach(TEST_OP) + +set_tests_properties(test_jacobian PROPERTIES TIMEOUT 20) +set_tests_properties(test_hessian PROPERTIES TIMEOUT 50) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_hessian.py b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py new file mode 100644 index 0000000000000..120a6c853e8d8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_hessian.py @@ -0,0 +1,140 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import paddle +import paddle.compat as cpt +from utils import _compute_numerical_hessian + + +class TestHessian(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = 1e-2 + self.rtol = 1e-2 + self.atol = 1e-2 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + def test_single_input(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x) + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + + def test_multi_input(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, y)) + + numerical_hessian = _compute_numerical_hessian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [self.x, self.y]) + for i in range(len(hessian)): + for j in range(len(hessian[0])): + assert np.allclose(hessian[i][j].numpy(), + numerical_hessian[i][j], self.rtol, + self.atol) + + def test_allow_unused_false(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + try: + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian(func, [self.x, self.y]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_allow_unused_true(self): + def func(x, y): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + hessian = paddle.autograd.hessian( + func, [self.x, self.y], allow_unused=True) + for i in range(len(hessian)): + for j in range(len(hessian[0])): + if i == j == 0: + assert np.allclose(hessian[i][j].numpy(), + numerical_hessian[i][j], self.rtol, + self.atol) + else: + assert hessian[i][j] is None + + def test_create_graph_false(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x) + assert hessian.stop_gradient == True + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + try: + paddle.grad(hessian, self.x) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("has no gradient") > 0 + + # TODO(levi): enable this test case when matmul_grad_grad_grad is ok + def _test_create_graph_true(self): + def func(x): + return paddle.sum(paddle.matmul(x, x)) + + numerical_hessian = _compute_numerical_hessian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + hessian = paddle.autograd.hessian(func, self.x, create_graph=True) + assert hessian.stop_gradient == False + assert np.allclose(hessian.numpy(), numerical_hessian[0][0], self.rtol, + self.atol) + triple_grad = paddle.grad(hessian, self.x) + assert triple_grad is not None + + +class TestHessianFloat64(TestHessian): + @classmethod + def setUpClass(self): + self.shape = (2, 2) + self.dtype = 'float64' + self.np_dtype = np.float64 + 
self.numerical_delta = 1e-5 + self.rtol = 1e-5 + self.atol = 1e-5 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py new file mode 100644 index 0000000000000..2f0b8c7cad3e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_jacobian.py @@ -0,0 +1,162 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.compat as cpt +from utils import _compute_numerical_jacobian + + +class TestJacobian(unittest.TestCase): + @classmethod + def setUpClass(self): + self.shape = (4, 4) + self.dtype = 'float32' + self.np_dtype = np.float32 + self.numerical_delta = 1e-4 + self.rtol = 1e-3 + self.atol = 1e-3 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + def test_single_input_and_single_output(self): + def func(x): + return paddle.matmul(x, x) + + numerical_jacobian = _compute_numerical_jacobian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, self.x) + assert np.allclose(jacobian.numpy(), numerical_jacobian[0][0], + self.rtol, self.atol) + + def test_single_input_and_multi_output(self): + def func(x): + return paddle.matmul(x, x), x * x + + numerical_jacobian = _compute_numerical_jacobian( + func, self.x, self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, self.x) + for i in range(len(jacobian)): + assert np.allclose(jacobian[i].numpy(), numerical_jacobian[i][0], + self.rtol, self.atol) + + def test_multi_input_and_single_output(self): + def func(x, y): + return paddle.matmul(x, y) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) + for j in range(len(jacobian)): + assert np.allclose(jacobian[j].numpy(), numerical_jacobian[0][j], + self.rtol, self.atol) + + def test_multi_input_and_multi_output(self): + def func(x, y): + return paddle.matmul(x, y), x * y + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) + for i in range(len(jacobian)): + for j in range(len(jacobian[0])): + assert np.allclose(jacobian[i][j].numpy(), + numerical_jacobian[i][j], self.rtol, + self.atol) + + def test_allow_unused_false(self): + def func(x, y): + return paddle.matmul(x, x) + + try: + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian 
= paddle.autograd.jacobian(func, [self.x, self.y]) + except ValueError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("allow_unused") > 0 + + def test_allow_unused_true(self): + def func(x, y): + return paddle.matmul(x, x) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian( + func, [self.x, self.y], allow_unused=True) + assert np.allclose(jacobian[0].numpy(), numerical_jacobian[0][0], + self.rtol, self.atol) + assert jacobian[1] is None + + def test_create_graph_false(self): + def func(x, y): + return paddle.matmul(x, y) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian(func, [self.x, self.y]) + for j in range(len(jacobian)): + assert jacobian[j].stop_gradient == True + assert np.allclose(jacobian[j].numpy(), numerical_jacobian[0][j], + self.rtol, self.atol) + try: + paddle.grad(jacobian[0], [self.x, self.y]) + except RuntimeError as e: + error_msg = cpt.get_exception_message(e) + assert error_msg.find("has no gradient") > 0 + + def test_create_graph_true(self): + def func(x, y): + return paddle.matmul(x, y) + + numerical_jacobian = _compute_numerical_jacobian( + func, [self.x, self.y], self.numerical_delta, self.np_dtype) + self.x.stop_gradient = False + self.y.stop_gradient = False + jacobian = paddle.autograd.jacobian( + func, [self.x, self.y], create_graph=True) + for j in range(len(jacobian)): + assert jacobian[j].stop_gradient == False + assert np.allclose(jacobian[j].numpy(), numerical_jacobian[0][j], + self.rtol, self.atol) + double_grad = paddle.grad(jacobian[0], [self.x, self.y]) + assert double_grad is not None + + +class TestJacobianFloat64(TestJacobian): + @classmethod + def setUpClass(self): + self.shape = (4, 4) + self.dtype = 'float64' + self.np_dtype = np.float64 + self.numerical_delta = 1e-7 + self.rtol = 1e-7 + self.atol = 1e-7 + self.x = paddle.rand(shape=self.shape, dtype=self.dtype) + self.y = paddle.rand(shape=self.shape, dtype=self.dtype) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py new file mode 100644 index 0000000000000..86331d36a3ca8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/test_vjp_jvp.py @@ -0,0 +1,294 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle + +from paddle.autograd.functional import vjp, jvp, to_tensorlist +from paddle import grad, ones_like, zeros_like + + +def reduce(x): + return paddle.sum(x) + + +def reduce_dim(x): + return paddle.sum(x, axis=0) + + +def matmul(x, y): + return paddle.matmul(x, y) + + +def mul(x, y): + return x * y + + +def pow(x, y): + return paddle.pow(x, y) + + +def o2(x, y): + return paddle.multiply(x, y), paddle.matmul(x, y.t()) + + +def unuse(x, y): + return paddle.sum(x) + + +def nested(x): + def inner(y): + return x * y + + return inner + + +def make_v(f, inputs): + outputs = to_tensorlist(f(*inputs)) + return [ones_like(x) for x in outputs] + + +class TestAutogradFunctional(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls.RAW_INPUTS = { + 'a': [1.0], + 'b': [1.0, 2.0], + 'c': [3.0, 4.0], + 'd': [[2.0], [3.0]], + 'A': [[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]], + 'B': [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0]], + } + + def setUp(self): + pass + + def gen_input(self, inp, stop_gradient=False): + if isinstance(inp, paddle.Tensor): + return inp + return paddle.to_tensor( + self.RAW_INPUTS[inp], stop_gradient=stop_gradient) + + def gen_inputs(self, inputs): + if isinstance(inputs, list): + inputs = [self.gen_input(x) for x in inputs] + else: + inputs = [self.gen_input(inputs)] + return inputs + + def gen_test_pairs(self, + func, + inputs, + v=None, + create_graph=False, + allow_unused=False): + def vjp_test(): + nonlocal v + xs = self.gen_inputs(inputs) + if v is not None: + v = self.gen_inputs(v) + outputs, inputs_grad = vjp(func, + xs, + v, + create_graph=create_graph, + allow_unused=allow_unused) + else: + outputs, inputs_grad = vjp(func, + xs, + create_graph=create_graph, + allow_unused=allow_unused) + return outputs, inputs_grad + + def grad_test(): + nonlocal v + xs = self.gen_inputs(inputs) + if v is not None: + v = self.gen_inputs(v) + outputs = func(*xs) + if v is not None: + inputs_grad = grad( + outputs, + xs, + v, + create_graph=create_graph, + allow_unused=allow_unused) + else: + inputs_grad = grad( + outputs, + xs, + create_graph=create_graph, + allow_unused=allow_unused) + return outputs, inputs_grad + + return vjp_test, grad_test + + def gen_jvp_tests(self, + func, + inputs, + v=None, + create_graph=False, + allow_unused=False): + def jvp_test(): + nonlocal v + xs = self.gen_inputs(inputs) + if v is not None: + v = self.gen_inputs(v) + outputs, outputs_grad = jvp(func, + xs, + v, + create_graph=create_graph, + allow_unused=allow_unused) + else: + outputs, outputs_grad = jvp(func, + xs, + create_graph=create_graph, + allow_unused=allow_unused) + return outputs, outputs_grad + + return jvp_test + + def check_results(self, ref, res): + type_error = 'Result is different than expected in shape or type' + value_error = 'Result is different than expected values' + if ref is None: + self.assertTrue(res is None, type_error) + elif isinstance(ref, paddle.Tensor): + self.assertTrue(isinstance(res, paddle.Tensor), type_error) + self.assertTrue(paddle.allclose(res, ref), value_error) + else: + self.assertTrue(len(res) == len(ref), type_error) + for i in range(len(ref)): + self.check_results(ref[i], res[i]) + return True + + +class TestVJP(TestAutogradFunctional): + def test_vjp_i1o1_no_create_graph(self): + test_cases = [ + [reduce, 'A'], #noqa + [reduce_dim, 'A'], #noqa + ] #noqa + for f, inputs in test_cases: + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def 
test_vjp_i2o1_no_create_graph(self): + test_cases = [ + [matmul, ['A', 'B']], #noqa + [mul, ['b', 'c']], #noqa + ] #noqa + for f, inputs in test_cases: + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def test_vjp_i2o2_no_create_graph(self): + test_cases = [ + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + v = make_v(f, inputs) + vjp, grad = self.gen_test_pairs(f, inputs, v=v) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def test_vjp_nested_no_create_graph(self): + x = self.gen_input('a') + test_cases = [ + [nested(x), 'a'], #noqa + ] + for f, inputs in test_cases: + vjp, grad = self.gen_test_pairs(f, inputs) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + def test_vjp_aliased_input_no_create_graph(self): + x = self.gen_input('a') + ref = self.gen_test_pairs(nested(x), 'a')[0] + aliased = self.gen_test_pairs(nested(x), x)[0] + ref_result, aliased_result = ref(), aliased() + self.check_results(ref_result, aliased_result) + + def test_vjp_allowunused_no_create_graph(self): + x, y = self.gen_input('A'), self.gen_input('a') + vjp, grad = self.gen_test_pairs(unuse, [x, y], allow_unused=True) + vjp_result, grad_result = vjp(), grad() + self.check_results(grad_result, vjp_result) + + +def jac(grad_fn, f, inputs): + assert grad_fn in [vjp, jvp] + if grad_fn is jvp: + vs = [zeros_like(x) for x in inputs] + else: + outputs = f(*inputs) + if isinstance(outputs, paddle.Tensor): + outputs = [outputs] + vs = [zeros_like(y) for y in outputs] + JJ_cols = [] + for i, v in enumerate(vs): + v = v.flatten() + for j in range(len(v)): + _v = zeros_like(v).detach() + _v[j] = 1.0 + _v = _v.reshape(vs[i].shape) + _vs = vs.copy() + _vs[i] = _v + _, grads = grad_fn(f, inputs, vs) + d_outs = paddle.concat([d_out.flatten() for d_out in grads]) + JJ_cols.append(d_outs) + # JJ is the fully unrolled jacobian + JJ = paddle.stack(JJ_cols) + if grad_fn is vjp: + JJ = JJ.t() + return JJ + + +class TestJVP(TestAutogradFunctional): + def test_jvp_i1o1_no_create_graph(self): + test_cases = [ + [reduce, 'A'], #noqa + [reduce_dim, 'A'], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + forward_jac = jac(jvp, f, inputs) + reverse_jac = jac(vjp, f, inputs) + self.check_results(forward_jac, reverse_jac) + + def test_jvp_i2o1_no_create_graph(self): + test_cases = [ #noqa + [matmul, ['A', 'B']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + forward_jac = jac(jvp, f, inputs) + reverse_jac = jac(vjp, f, inputs) + self.check_results(forward_jac, reverse_jac) + + def test_jvp_i2o2_no_create_graph(self): + test_cases = [ #noqa + [o2, ['A', 'A']], #noqa + ] #noqa + for f, inputs in test_cases: + inputs = self.gen_inputs(inputs) + forward_jac = jac(jvp, f, inputs) + reverse_jac = jac(vjp, f, inputs) + self.check_results(forward_jac, reverse_jac) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/utils.py b/python/paddle/fluid/tests/unittests/autograd/utils.py new file mode 100644 index 0000000000000..0aadef4a809f3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/autograd/utils.py @@ -0,0 +1,107 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
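The jac helper above is meant to rebuild the full Jacobian by probing with one-hot vectors: each jvp probe yields one column of J, each vjp probe one row, and after stacking (plus a transpose on the vjp side) the forward- and reverse-mode constructions should agree. A rough sketch of a single probe with the same functional API, using a linear map whose Jacobian is known exactly (names and values here are illustrative only):

import numpy as np
import paddle
from paddle.autograd.functional import jvp, vjp

# f(x) = W @ x is linear, so its Jacobian with respect to x is exactly W.
W = paddle.to_tensor(np.array([[1., 2., 3.], [2., 3., 4.]], dtype='float32'))

def f(x):
    return paddle.matmul(W, x)

x = paddle.ones([3], dtype='float32')
x.stop_gradient = False

tangent = paddle.to_tensor(np.array([0., 1., 0.], dtype='float32'))
_, jvp_out = jvp(f, [x], [tangent])    # holds J @ tangent: the 2nd column of W

cotangent = paddle.to_tensor(np.array([1., 0.], dtype='float32'))
_, vjp_out = vjp(f, [x], [cotangent])  # holds cotangent @ J: the 1st row of W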
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +from paddle.autograd.functional import _check_tensors + + +def _product(t): + if isinstance(t, int): + return t + else: + return np.product(t) + + +def _get_item(t, idx): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." + flat_t = paddle.reshape(t, [-1]) + return flat_t.__getitem__(idx) + + +def _set_item(t, idx, value): + assert isinstance(t, paddle.Tensor), "The first argument t must be Tensor." + assert isinstance(idx, + int), "The second argument idx must be an int number." + flat_t = paddle.reshape(t, [-1]) + flat_t.__setitem__(idx, value) + return paddle.reshape(flat_t, t.shape) + + +def _compute_numerical_jacobian(func, xs, delta, np_dtype): + xs = _check_tensors(xs, "xs") + ys = _check_tensors(func(*xs), "ys") + fin_size = len(xs) + fout_size = len(ys) + jacobian = list([] for _ in range(fout_size)) + for i in range(fout_size): + jac_i = list([] for _ in range(fin_size)) + for j in range(fin_size): + jac_i[j] = np.zeros( + (_product(ys[i].shape), _product(xs[j].shape)), dtype=np_dtype) + jacobian[i] = jac_i + + for j in range(fin_size): + for q in range(_product(xs[j].shape)): + orig = _get_item(xs[j], q) + x_pos = orig + delta + xs[j] = _set_item(xs[j], q, x_pos) + ys_pos = _check_tensors(func(*xs), "ys_pos") + + x_neg = orig - delta + xs[j] = _set_item(xs[j], q, x_neg) + ys_neg = _check_tensors(func(*xs), "ys_neg") + + xs[j] = _set_item(xs[j], q, orig) + + for i in range(fout_size): + for p in range(_product(ys[i].shape)): + y_pos = _get_item(ys_pos[i], p) + y_neg = _get_item(ys_neg[i], p) + jacobian[i][j][p][q] = (y_pos - y_neg) / delta / 2. + return jacobian + + +def _compute_numerical_hessian(func, xs, delta, np_dtype): + xs = _check_tensors(xs, "xs") + ys = _check_tensors(func(*xs), "ys") + fin_size = len(xs) + hessian = list([] for _ in range(fin_size)) + for i in range(fin_size): + hessian_i = list([] for _ in range(fin_size)) + for j in range(fin_size): + hessian_i[j] = np.zeros( + (_product(xs[i].shape), _product(xs[j].shape)), dtype=np_dtype) + hessian[i] = hessian_i + + for i in range(fin_size): + for p in range(_product(xs[i].shape)): + for j in range(fin_size): + for q in range(_product(xs[j].shape)): + orig = _get_item(xs[j], q) + x_pos = orig + delta + xs[j] = _set_item(xs[j], q, x_pos) + jacobian_pos = _compute_numerical_jacobian(func, xs, delta, + np_dtype) + x_neg = orig - delta + xs[j] = _set_item(xs[j], q, x_neg) + jacobian_neg = _compute_numerical_jacobian(func, xs, delta, + np_dtype) + xs[j] = _set_item(xs[j], q, orig) + hessian[i][j][p][q] = ( + jacobian_pos[0][i][0][p] - jacobian_neg[0][i][0][p] + ) / delta / 2. 
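+                    # Both numerical helpers above use a second-order central difference:
+                    # for a step `delta`, df/dx is approximated by
+                    #     (f(x + delta) - f(x - delta)) / (2 * delta),
+                    # so each Hessian entry is a central difference of first-order
+                    # numerical Jacobians and the truncation error is roughly O(delta ** 2).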
+ return hessian diff --git a/python/paddle/fluid/tests/unittests/dist_transformer.py b/python/paddle/fluid/tests/unittests/dist_transformer.py index 6546bb5549df8..db321f9417880 100644 --- a/python/paddle/fluid/tests/unittests/dist_transformer.py +++ b/python/paddle/fluid/tests/unittests/dist_transformer.py @@ -1450,7 +1450,7 @@ def wrap_decoder(trg_vocab_size, # This is used to implement independent decoder program in inference. trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias, \ enc_output = make_all_inputs( - decoder_data_input_fields + decoder_util_input_fields) + decoder_data_input_fields) else: trg_word, trg_pos, trg_slf_attn_bias, trg_src_attn_bias = dec_inputs diff --git a/python/paddle/fluid/tests/unittests/fft/test_fft.py b/python/paddle/fluid/tests/unittests/fft/test_fft.py index 26355e0411fa3..c83c943217d4e 100644 --- a/python/paddle/fluid/tests/unittests/fft/test_fft.py +++ b/python/paddle/fluid/tests/unittests/fft/test_fft.py @@ -108,6 +108,8 @@ def decorate(cls): ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) class TestFft(unittest.TestCase): def test_fft(self): + """Test fft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -127,7 +129,14 @@ def test_fft(self): ('test_norm_not_in_enum_value', rand_x(2), None, -1, 'random', ValueError) ]) class TestFftException(unittest.TestCase): - def test_Fft(self): + def test_fft(self): + """Test fft with buoudary condition + Test case include: + - n out of range + - axis out of range + - axis type error + - norm out of range + """ with self.assertRaises(self.expect_exception): paddle.fft.fft( paddle.to_tensor(self.x), self.n, self.axis, self.norm) @@ -149,7 +158,9 @@ def test_Fft(self): ('test_norm_ortho', rand_x(5), None, (0, 1), 'ortho'), ]) class TestFft2(unittest.TestCase): - def test_Fft2(self): + def test_fft2(self): + """Test fft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -178,6 +189,15 @@ def test_Fft2(self): ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestFft2Exception(unittest.TestCase): def test_fft2(self): + """Test fft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis out of range + - axis type error + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.fft2( @@ -198,7 +218,9 @@ def test_fft2(self): 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) class TestFftn(unittest.TestCase): - def test_Fftn(self): + def test_fftn(self): + """Test fftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.fftn(self.x, self.n, self.axis, self.norm), @@ -230,10 +252,9 @@ def test_Fftn(self): "ortho"), ]) class TestHfft(unittest.TestCase): - """Test hfft with norm condition - """ - def test_hfft(self): + """Test hfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.hfft(self.x, self.n, self.axis, self.norm), @@ -265,10 +286,9 @@ def test_hfft(self): "ortho"), ]) class TestIrfft(unittest.TestCase): - """Test irfft with norm condition - """ - def test_irfft(self): + """Test irfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.irfft(self.x, self.n, self.axis, self.norm), @@ -299,11 +319,10 @@ def 
test_irfft(self): np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, "ortho"), ]) -class Testirfftn(unittest.TestCase): - """Test irfftn with norm condition - """ - +class TestIrfftn(unittest.TestCase): def test_irfftn(self): + """Test irfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.irfftn(self.x, self.n, self.axis, self.norm), @@ -334,11 +353,10 @@ def test_irfftn(self): np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, None, "ortho"), ]) -class Testhfftn(unittest.TestCase): - """Test hfftn with norm condition - """ - +class TestHfftn(unittest.TestCase): def test_hfftn(self): + """Test hfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.hfftn(self.x, self.n, self.axis, self.norm), @@ -365,11 +383,10 @@ def test_hfftn(self): np.random.randn(4, 4, 4) + 1j * np.random.randn(4, 4, 4), None, (-2, -1), "ortho"), ]) -class Testhfft2(unittest.TestCase): - """Test hfft2 with norm condition - """ - +class TestHfft2(unittest.TestCase): def test_hfft2(self): + """Test hfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.hfft2(self.x, self.s, self.axis, self.norm), @@ -398,10 +415,9 @@ def test_hfft2(self): "ortho"), ]) class TestIrfft2(unittest.TestCase): - """Test irfft2 with norm condition - """ - def test_irfft2(self): + """Test irfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.irfft2(self.x, self.s, self.axis, self.norm), @@ -434,14 +450,16 @@ def test_irfft2(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, -1, 'random', ValueError)]) class TestHfftException(unittest.TestCase): - '''Test hfft with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - ''' - def test_hfft(self): + """Test hfft with buoudary condition + Test case include: + Test case include: + - n out of range + - n type error + - axis out of range + - axis type error + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.hfft( @@ -466,15 +484,16 @@ def test_hfft(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestIrfftException(unittest.TestCase): - '''Test Irfft with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_irfft(self): + """ + Test irfft with buoudary condition + Test case include: + - n out of range + - n type error + - axis type error + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.irfft( @@ -505,15 +524,17 @@ def test_irfft(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestHfft2Exception(unittest.TestCase): - '''Test hfft2 with buoudary condition - Test case include: - - n out of range - - axis out of range - - the dimensions of n and axis are different - - norm out of range - ''' - def test_hfft2(self): + """ + Test hfft2 with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with 
self.assertRaises(self.expect_exception): paddle.fft.hfft2( @@ -544,15 +565,17 @@ def test_hfft2(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestIrfft2Exception(unittest.TestCase): - '''Test irfft2 with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_irfft2(self): + """ + Test irfft2 with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.irfft2( @@ -584,15 +607,16 @@ def test_irfft2(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestHfftnException(unittest.TestCase): - '''Test hfftn with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_hfftn(self): + """Test hfftn with buoudary condition + Test case include: + - input type error + - n type error + - n out of range + - axis out of range + - the dimensions of n and axis are different + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.hfftn( @@ -620,15 +644,15 @@ def test_hfftn(self): np.random.randn(4, 4) + 1j * np.random.randn(4, 4), None, None, 'random', ValueError)]) class TestIrfftnException(unittest.TestCase): - '''Test irfftn with buoudary condition - Test case include: - - n out of range - - axis out of range - - norm out of range - - the dimensions of n and axis are different - ''' - def test_irfftn(self): + """Test irfftn with buoudary condition + Test case include: + - n out of range + - n type error + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.irfftn( @@ -648,6 +672,8 @@ def test_irfftn(self): ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) class TestRfft(unittest.TestCase): def test_rfft(self): + """Test rfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -668,6 +694,14 @@ def test_rfft(self): ]) class TestRfftException(unittest.TestCase): def test_rfft(self): + """Test rfft with buoudary condition + Test case include: + - n out of range + - axis out of range + - axis type error + - norm out of range + - the dimensions of n and axis are different + """ with self.assertRaises(self.expect_exception): paddle.fft.rfft( paddle.to_tensor(self.x), self.n, self.axis, self.norm) @@ -688,6 +722,8 @@ def test_rfft(self): ]) class TestRfft2(unittest.TestCase): def test_rfft2(self): + """Test rfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -715,7 +751,16 @@ def test_rfft2(self): ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError), ]) class TestRfft2Exception(unittest.TestCase): - def test_rfft(self): + def test_rfft2(self): + """Test rfft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ with paddle.fluid.dygraph.guard(self.place): with 
self.assertRaises(self.expect_exception): paddle.fft.rfft2( @@ -736,6 +781,8 @@ def test_rfft(self): ]) class TestRfftn(unittest.TestCase): def test_rfftn(self): + """Test rfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -759,7 +806,14 @@ def test_rfftn(self): ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestRfftnException(unittest.TestCase): - def test_rfft(self): + def test_rfftn(self): + """Test rfftn with buoudary condition + Test case include: + - n out of range + - axis out of range + - norm out of range + - the dimensions of n and axis are different + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.rfftn( @@ -779,6 +833,8 @@ def test_rfft(self): ('test_norm_ortho', rand_x(5), None, 3, 'ortho')]) class TestIhfft(unittest.TestCase): def test_ihfft(self): + """Test ihfft with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.ihfft(self.x, self.n, self.axis, self.norm), @@ -798,6 +854,12 @@ def test_ihfft(self): ]) class TestIhfftException(unittest.TestCase): def test_ihfft(self): + """Test ihfft with buoudary condition + Test case include: + - axis type error + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.ihfft( @@ -819,6 +881,8 @@ def test_ihfft(self): ]) class TestIhfft2(unittest.TestCase): def test_ihfft2(self): + """Test ihfft2 with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.ihfft2(self.x, self.n, self.axis, self.norm), @@ -844,7 +908,16 @@ def test_ihfft2(self): -10, 'backward', ValueError), ('test_norm_not_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestIhfft2Exception(unittest.TestCase): - def test_rfft(self): + def test_ihfft2(self): + """Test ihfft2 with buoudary condition + Test case include: + - input type error + - input dim error + - n out of range + - axis type error + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.ihfft2( @@ -863,7 +936,9 @@ def test_rfft(self): 'backward'), ('test_norm_forward', rand_x(5), None, None, 'forward'), ('test_norm_ortho', rand_x(5), None, None, 'ortho')]) class TestIhfftn(unittest.TestCase): - def test_rfftn(self): + def test_ihfftn(self): + """Test ihfftn with norm condition + """ with paddle.fluid.dygraph.guard(self.place): self.assertTrue( np.allclose( @@ -885,7 +960,14 @@ def test_rfftn(self): ValueError), ('test_norm_not_in_enum', rand_x(2), None, -1, 'random', ValueError)]) class TestIhfftnException(unittest.TestCase): - def test_rfft(self): + def test_ihfftn(self): + """Test ihfftn with buoudary condition + Test case include: + - input type error + - n out of range + - axis out of range + - norm out of range + """ with paddle.fluid.dygraph.guard(self.place): with self.assertRaises(self.expect_exception): paddle.fft.ihfftn( @@ -899,6 +981,8 @@ def test_rfft(self): ]) class TestFftFreq(unittest.TestCase): def test_fftfreq(self): + """Test fftfreq with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.fftfreq(self.n, self.d).astype(self.dtype), @@ -914,6 +998,8 @@ def test_fftfreq(self): ]) class TestRfftFreq(unittest.TestCase): def test_rfftfreq(self): + """Test rfftfreq with norm condition 
+ """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.rfftfreq(self.n, self.d).astype(self.dtype), @@ -929,6 +1015,8 @@ def test_rfftfreq(self): ]) class TestFftShift(unittest.TestCase): def test_fftshift(self): + """Test fftshift with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.fftshift(self.x, self.axes), @@ -945,6 +1033,8 @@ def test_fftshift(self): ]) class TestIfftShift(unittest.TestCase): def test_ifftshift(self): + """Test ifftshift with norm condition + """ with paddle.fluid.dygraph.guard(self.place): np.testing.assert_allclose( scipy.fft.ifftshift(self.x, self.axes), diff --git a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py index 1535fac499ec6..6b49049073948 100644 --- a/python/paddle/fluid/tests/unittests/hdfs_test_utils.py +++ b/python/paddle/fluid/tests/unittests/hdfs_test_utils.py @@ -245,6 +245,15 @@ def _test_touch(self, fs): self.assertFalse(fs.is_dir(path)) fs.delete(path) + def _test_list_files_info(self, fs): + path = [] + fs.list_files_info(path) + path = ["./list_files_info.flag"] + fs.list_files_info(path) + fs.touch(path, exist_ok=True) + fs.list_files_info(path) + fs.delete(path) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py index 912849ffbeb71..71e873b0e2f7c 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_alexnet.py @@ -53,6 +53,13 @@ def setUp(self): } fleet.init(is_collective=True, strategy=strategy) + def build_optimizer(self, model): + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + parameters=model.parameters()) + return scheduler, optimizer + def test_pp_model(self): hcg = fleet.get_hybrid_communicate_group() word_size = hcg.get_model_parallel_world_size() @@ -63,10 +70,7 @@ def test_pp_model(self): #construct model a model_a = AlexNet(10) - scheduler_a = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2], values=[0.001, 0.002], verbose=True) - optimizer_a = paddle.optimizer.SGD(learning_rate=scheduler_a, - parameters=model_a.parameters()) + scheduler_a, optimizer_a = self.build_optimizer(model_a) param_len = len(model_a.parameters()) @@ -76,10 +80,7 @@ def test_pp_model(self): # construct model b model_b = AlexNetPipeDesc(num_stages=self.pipeline_parallel_size) - scheduler_b = paddle.optimizer.lr.PiecewiseDecay( - boundaries=[2], values=[0.001, 0.002], verbose=True) - optimizer_b = paddle.optimizer.SGD(learning_rate=scheduler_b, - parameters=model_b.parameters()) + scheduler_b, optimizer_b = self.build_optimizer(model_b) model_b = fleet.distributed_model(model_b) optimizer_b = fleet.distributed_optimizer(optimizer_b) diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py new file mode 100644 index 0000000000000..de980f3c3f787 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_clip_grad.py @@ -0,0 +1,35 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import paddle +import unittest +from hybrid_parallel_pp_alexnet import TestDistPPTraning + + +class TestPPClipGrad(TestDistPPTraning): + def build_optimizer(self, model): + grad_clip = paddle.nn.ClipGradByGlobalNorm(0.5) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True) + optimizer = paddle.optimizer.SGD(learning_rate=scheduler, + grad_clip=grad_clip, + parameters=model.parameters()) + return scheduler, optimizer + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py index 2995e4dbf8401..8cb1166cd0d83 100644 --- a/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py +++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_sharding_model.py @@ -183,21 +183,23 @@ def build_optimizer(self, strategy=None, is_sharding=True, Optimizer="adam"): - + clip = paddle.nn.ClipGradByGlobalNorm(0.5) if Optimizer == "adam": if is_sharding: optimizer = DygraphShardingOptimizer( hcg=fleet.get_hybrid_communicate_group(), user_defined_strategy=strategy, params=model.parameters(), - inner_optimizer_class=paddle.optimizer.Adam, + inner_optimizer_class=paddle.optimizer.AdamW, learning_rate=0.001, - weight_decay=0.00001, ) + weight_decay=0.00001, + grad_clip=clip) else: - optimizer = paddle.optimizer.Adam( + optimizer = paddle.optimizer.AdamW( parameters=model.parameters(), learning_rate=0.001, - weight_decay=0.00001, ) + weight_decay=0.00001, + grad_clip=clip) else: if is_sharding: optimizer = DygraphShardingOptimizer( @@ -205,10 +207,13 @@ def build_optimizer(self, user_defined_strategy=strategy, params=model.parameters(), inner_optimizer_class=paddle.optimizer.Momentum, - learning_rate=0.001, ) + learning_rate=0.001, + grad_clip=clip) else: optimizer = paddle.optimizer.Momentum( - learning_rate=0.001, parameters=model.parameters()) + learning_rate=0.001, + parameters=model.parameters(), + grad_clip=clip) return optimizer def build_model_optimizer(self, Optimizer="adam"): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py index 8e196f5081f73..62825caf5185c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_activation_pass.py @@ -139,6 +139,42 @@ def append_act(self, x): return fluid.layers.swish(x) +class TensorRTSubgraphPassMishTest(TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, True, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + +class TensorRTSubgraphPassMishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = 
TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, True, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + +class TensorRTSubgraphPassDynamicMishFp16SerializeTest( + TensorRTSubgraphPassActivationTest): + def setUpTensorRTParam(self): + self.enable_trt = True + self.trt_parameters = TensorRTSubgraphPassActivationTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Half, False, False) + self.dynamic_shape_params = TensorRTSubgraphPassActivationTest.DynamicShapeParam( + { + 'data': [1, 6, 8, 8] + }, {'data': [1, 6, 512, 512]}, {'data': [1, 6, 256, 256]}, False) + + def append_act(self, x): + return fluid.layers.mish(x) + + class TensorRTSubgraphPassPreluAllTest(TensorRTSubgraphPassActivationTest): def append_act(self, x): return fluid.layers.prelu(x, mode='all') diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py index 82dd492b5275f..2c8f2592a737c 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -173,7 +173,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -185,7 +185,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), (1e-5, 1e-5) @@ -214,6 +214,16 @@ def teller2(program_config, predictor_config): "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." ) + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." + ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py index e6b3aa30bf896..fc2358bb11636 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d.py @@ -165,7 +165,6 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), (1e-5, 1e-5) # for dynamic_shape - generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(attrs, @@ -190,6 +189,16 @@ def teller1(program_config, predictor_config): "When padding_algorithm is 'SAME' or 'VALID', Trt dose not support. In this case, trt build error is caused by scale op." 
) + def teller2(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." + ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py index 473925c6cdb79..2fcd2bf5aca97 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_depthwise_conv2d_transpose.py @@ -137,7 +137,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False), (1e-5, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), (1e-5, 1e-5) @@ -178,6 +178,16 @@ def teller2(program_config, predictor_config): "When dilations's element is not equal 1, there are different behaviors between Trt and Paddle." ) + def teller3(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Int8: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle." + ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 2d18738b614cb..c8cba0f372380 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -32,8 +32,8 @@ def generate_weight(): return np.random.randn(32).astype(np.float32) for batch in [1, 2, 4]: - for shape in [[32], [batch, 32], [batch, 64, 32], - [batch, 8, 16, 32]]: + for shape in [[32], [batch, 32], [batch, 32, 32], + [batch, 32, 16, 32]]: for op_type in ["elementwise_add", "elementwise_mul"]: for axis in [len(shape) - 1, -1]: self.dims = len(shape) @@ -68,26 +68,27 @@ def generate_weight(): def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): + # The input.dims[1] must be equal to the weight's length. 
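+            # For example, the weight generated above has 32 elements, so the 2-D,
+            # 3-D and 4-D profiles below all pin dimension 1 to 32 and only let the
+            # batch and the trailing spatial dimensions vary.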
if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [4]} self.dynamic_shape.max_input_shape = {"input_data": [256]} self.dynamic_shape.opt_input_shape = {"input_data": [16]} elif self.dims == 2: - self.dynamic_shape.min_input_shape = {"input_data": [1, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 256]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 16]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 32]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 32]} elif self.dims == 3: - self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4]} + self.dynamic_shape.min_input_shape = {"input_data": [1, 32, 4]} self.dynamic_shape.max_input_shape = { - "input_data": [4, 256, 256] + "input_data": [4, 32, 256] } self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 16]} elif self.dims == 4: self.dynamic_shape.min_input_shape = { - "input_data": [1, 4, 4, 4] + "input_data": [1, 32, 4, 4] } self.dynamic_shape.max_input_shape = { - "input_data": [4, 256, 128, 256] + "input_data": [4, 32, 128, 256] } self.dynamic_shape.opt_input_shape = { "input_data": [2, 32, 32, 16] @@ -98,6 +99,11 @@ def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.opt_input_shape = {} + def generate_trt_nodes_num(attrs, dynamic_shape): + if self.dims == 1: + return 0, 3 + return 1, 2 + attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -106,18 +112,52 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 3), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dims == 2 and len(self.dynamic_shape.max_input_shape) == 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." 
+ ) + + def teller2(program_config, predictor_config): + if self.dims == 3: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and tensorrt when input dim is 3.") + + def teller3(program_config, predictor_config): + if self.dims == 4: + return True + return False + + self.add_skip_case( + teller3, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and tensorrt when input dim is 4.") def test(self): + self.add_skip_trt_case() self.run_test() @@ -245,15 +285,26 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dims == 2: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest): def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs - if len(inputs['input_data1'].shape) == 1 or len(inputs['input_data2'] - .shape) == 1: + if len(inputs['input_data1'].shape) != len(inputs['input_data2'].shape): return False return True @@ -264,24 +315,27 @@ def generate_input(shape): input1_shape_list = [[4, 32], [2, 4, 32], [4, 2, 4, 32]] input2_shape1_list = [[32], [4, 32], [2, 4, 32]] - input2_shape2_list = [[1, 32], [1, 1, 32], [1, 1, 1, 32]] - input2_shape3_list = [[1, 32], [1, 4, 32], [4, 32]] + input2_shape2_list = [[4, 1], [2, 4, 1], [4, 2, 4, 1]] + input2_shape3_list = [[32], [2, 1, 1], [4, 2, 1, 1]] + input2_shape4_list = [[32], [4, 32], [4, 1, 1, 1]] input2_shape_list = [ - input2_shape1_list, input2_shape2_list, input2_shape3_list + input2_shape1_list, input2_shape2_list, input2_shape3_list, + input2_shape4_list ] axis1_list = [[-1], [1, -1], [1, -1]] - axis2_list = [[-1], [-1], [-1]] - axis3_list = [[-1], [-1], [2, -1]] - axis_list = [axis1_list, axis2_list, axis3_list] + axis2_list = [[-1], [0], [0]] + axis3_list = [[-1], [0], [0]] + axis4_list = [[-1], [-1], [0]] + axis_list = [axis1_list, axis2_list, axis3_list, axis4_list] for i in range(3): input1_shape = input1_shape_list[i] - for j in range(3): + for j in range(4): input2_shape = input2_shape_list[j][i] for op_type in ["elementwise_add", "elementwise_mul"]: for axis in axis_list[j][i]: - self.dims1 = len(input1_shape) - self.dims2 = len(input2_shape) + self.shape1 = input1_shape + self.shape2 = input2_shape dics = [{"axis": axis}] ops_config = [{ "op_type": op_type, @@ -318,16 +372,16 @@ def generate_dynamic_shape(attrs): opt_shape = [[32], [32, 32], [32, 32, 32], [32, 32, 32, 32]] self.dynamic_shape.min_input_shape = { - "input_data1": min_shape[self.dims1 - 1], - "input_data2": min_shape[self.dims2 - 1] + "input_data1": min_shape[len(self.shape1) - 1], + "input_data2": min_shape[len(self.shape2) - 1] } self.dynamic_shape.max_input_shape = { - "input_data1": max_shape[self.dims1 - 1], - "input_data2": max_shape[self.dims2 - 1] + "input_data1": max_shape[len(self.shape1) - 1], + "input_data2": max_shape[len(self.shape2) - 1] } self.dynamic_shape.opt_input_shape = { - "input_data1": opt_shape[self.dims1 - 1], - "input_data2": opt_shape[self.dims2 - 1] + "input_data1": opt_shape[len(self.shape1) - 1], + "input_data2": opt_shape[len(self.shape2) - 1] } def 
clear_dynamic_shape(): @@ -342,10 +396,11 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() - self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 - self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + if self.shape1[0] == self.shape2[0]: + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), 1e-5 # for dynamic_shape generate_dynamic_shape(attrs) @@ -354,7 +409,19 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.shape1) == 2: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output shape are not equal between gpu and tensorrt when input dim is 2." + ) + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py index f25a3b82476dc..d7b0bcd908085 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_emb_eltwise_layernorm.py @@ -252,7 +252,19 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 4), 1e-5 + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Half and len( + self.dynamic_shape.min_input_shape) != 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt when dynamic fp16 mode.") + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py index 0224f20ec747e..b6b5aa9dbfe95 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py @@ -114,19 +114,33 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) - # self.trt_param.precision = paddle_infer.PrecisionType.Float32 - # yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5 - # self.trt_param.precision = paddle_infer.PrecisionType.Half - # yield self.create_inference_config(), generate_trt_nodes_num(attrs, True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + self.trt_param.precision = 
paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The goup_norm plugin will check dim not -1 failed when dynamic fp16 mode." + ) def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py new file mode 100644 index 0000000000000..d223fd529ab17 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_mish.py @@ -0,0 +1,174 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertMishTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input(batch, dim1, dim2, dim3): + shape = [batch] + if dim1 != 0: + shape.append(dim1) + if dim2 != 0: + shape.append(dim2) + if dim3 != 0: + shape.append(dim3) + return np.random.random(shape).astype(np.float32) + + for batch in [1, 4]: + for dim1 in [0, 3]: + for dim2 in [0, 16]: + for dim3 in [0, 32]: + for thre in [5.0, 20.0]: + self.dim1 = dim1 + self.dim2 = dim2 + self.dim3 = dim3 + + if dim1 == 0 and dim2 != 0: + continue + if dim1 == 0 and dim2 == 0 and dim3 != 0: + continue + + ops_config = [{ + "op_type": "mish", + "op_inputs": { + "X": ["input_data"] + }, + "op_outputs": { + "Out": ["mish_output_data"] + }, + "op_attrs": { + "threshold": thre + } + }] + + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input, batch, + dim1, dim2, dim3)) + }, + outputs=["mish_output_data"]) + + yield program_config + + def sample_predictor_configs(self, program_config): + def generate_dynamic_shape(attrs): + if self.dim1 == 0: + self.dynamic_shape.min_input_shape = {"input_data": [1], } + self.dynamic_shape.max_input_shape = {"input_data": [4], } + self.dynamic_shape.opt_input_shape = {"input_data": [2], } + else: + if self.dim2 == 0 and self.dim3 == 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3], + } + elif self.dim2 != 0 and self.dim3 != 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 1, 1], + } + 
self.dynamic_shape.max_input_shape = { + "input_data": [4, 64, 128, 128], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 16, 32], + } + elif self.dim3 == 0: + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [4, 64, 256], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [2, 3, 128], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs + for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num(attrs, + True), 1e-5 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.dim1 == 0 and self.dim2 == 0 and self.dim3 == 0: + return True + return False + + self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, + "Trt does not support 1-dimensional input.") + + def teller2(program_config, predictor_config): + if (len(self.dynamic_shape.min_input_shape) == 0): + if self.dim1 != 0 and self.dim2 == 0 and self.dim3 == 0: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_SUPPORT, + "Need to repair the case: the output of GPU and tensorrt has diff when the input dimension is 2 in static shape mode." 
+ ) + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index e772df522b5c5..0b98ab53fcc29 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -38,6 +38,7 @@ def generate_weight2(): return np.random.randn(768).astype(np.float32) for batch in [1, 2, 4]: + self.batch = batch for reshape_shape in [[0, 0, 12, 64]]: for dim1 in [128]: input2_shapes = [[batch, reshape_shape[2], dim1, dim1], @@ -417,18 +418,40 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 4), 1e-5 + yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 4), 1e-5 + yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Half: + return True + return False + + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt in fp16 mode.") + + def teller2(program_config, predictor_config): + if self.trt_param.precision == paddle_infer.PrecisionType.Float32 and len( + self.dynamic_shape.min_input_shape) != 0 and self.batch > 2: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2." 
+ ) def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index 3e923b1bd89d6..9ec2f83fa5ba0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -21,9 +21,22 @@ class TrtConvertPool2dTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: + def is_paddings_valid(self, program_config: ProgramConfig) -> bool: + exclusive = program_config.ops[0].attrs['exclusive'] + paddings = program_config.ops[0].attrs['paddings'] + ksize = program_config.ops[0].attrs['ksize'] + pooling_type = program_config.ops[0].attrs['pooling_type'] + global_pooling = program_config.ops[0].attrs['global_pooling'] + if global_pooling == False: + if pooling_type == 'avg': + for index in range(len(ksize)): + if ksize[index] <= paddings[index]: + return False return True + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return self.is_paddings_valid(program_config) + def sample_program_configs(self): self.trt_param.workspace_size = 1073741824 @@ -34,7 +47,7 @@ def generate_weight1(attrs: List[Dict[str, Any]]): return np.random.random([24, 3, 3, 3]).astype(np.float32) for strides in [[1, 1], [2, 2], [1, 2]]: - for paddings in [[0, 2], [0, 3], [1, 2, 3, 4]]: + for paddings in [[0, 2], [0, 3], [0, 1, 2, 3]]: for pooling_type in ['max', 'avg']: for padding_algotithm in ['EXPLICIT', 'SAME', 'VAILD']: for ksize in [[2, 3], [3, 3]]: @@ -43,7 +56,6 @@ def generate_weight1(attrs: List[Dict[str, Any]]): for exclusive in [True, False]: for adaptive in [True, False]: for ceil_mode in [True, False]: - self.paddings = paddings dics = [{ "pooling_type": @@ -102,9 +114,6 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if self.paddings == [0, 3] or attrs[0][ - 'global_pooling'] == True or attrs[0]['ceil_mode'] == True: - return 0, 3 return 1, 2 attrs = [ @@ -139,6 +148,15 @@ def teller1(program_config, predictor_config): self.add_skip_case(teller1, SkipReasons.TRT_NOT_IMPLEMENTED, "4-dims paddings are not support for trt now.") + def teller2(program_config, predictor_config): + if program_config.ops[0].attrs['global_pooling'] == True: + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_IMPLEMENTED, + "It is not support that global_pooling is true for trt now.") + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py index 2166bbaa98b2f..b0124f055b4e1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_yolo_box_op.py @@ -116,5 +116,56 @@ def test_check_output(self): PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) +class TRTYoloBoxIoUAwareTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + image_shape = [self.bs, self.channel, self.height, self.width] + image = fluid.data(name='image', shape=image_shape, dtype='float32') + image_size = fluid.data( + name='image_size', shape=[self.bs, 2], dtype='int32') + boxes, scores = 
self.append_yolobox(image, image_size) + + self.feeds = { + 'image': np.random.random(image_shape).astype('float32'), + 'image_size': np.random.randint( + 32, 64, size=(self.bs, 2)).astype('int32'), + } + self.enable_trt = True + self.trt_parameters = TRTYoloBoxTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [scores, boxes] + + def set_params(self): + self.bs = 4 + self.channel = 258 + self.height = 64 + self.width = 64 + self.class_num = 80 + self.anchors = [10, 13, 16, 30, 33, 23] + self.conf_thresh = .1 + self.downsample_ratio = 32 + self.iou_aware = True + self.iou_aware_factor = 0.5 + + def append_yolobox(self, image, image_size): + return fluid.layers.yolo_box( + x=image, + img_size=image_size, + class_num=self.class_num, + anchors=self.anchors, + conf_thresh=self.conf_thresh, + downsample_ratio=self.downsample_ratio, + iou_aware=self.iou_aware, + iou_aware_factor=self.iou_aware_factor) + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu, flatten=True) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py index c8b9d5e5739dd..851ae21c38378 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_generate_pass.py @@ -123,6 +123,9 @@ def convert_ops_to_op_dicts(self, ops): op_dicts[op.type] = [op] return op_dicts + def test_has_attr(self): + self.assertFalse(hasattr(ir.PassDesc.OP, '__name__')) + def test_generate_fc_fuse(self): def _check_fc_fuse_pass(pass_desc, with_relu): pattern_op_dicts = self.convert_ops_to_op_dicts( diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py new file mode 100644 index 0000000000000..49ca89a35f4ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_subgraph_python_interface.py @@ -0,0 +1,96 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
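One detail of TRTYoloBoxIoUAwareTest above that is easy to miss: the 258 input channels in set_params are consistent with the usual yolo_box convention of anchor_num * (6 + class_num) channels when iou_aware is enabled, versus anchor_num * (5 + class_num) without it. A purely illustrative sanity check:

anchor_num = len([10, 13, 16, 30, 33, 23]) // 2    # three (w, h) anchor pairs
class_num = 80
assert anchor_num * (6 + class_num) == 258          # matches self.channel above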
+ +import unittest +import paddle +import paddle.fluid as fluid +import six + +from paddle.fluid.framework import IrGraph +from paddle.fluid.framework import IrNode +from paddle.fluid.tests.unittests.op_test import OpTestTool +from paddle.fluid import core +import paddle.fluid.layers as layers +from paddle.fluid.framework import Program, program_guard, default_startup_program +from paddle.fluid.contrib.slim.quantization import QuantizationTransformPass + +paddle.enable_static() + + +class TestQuantizationSubGraph(unittest.TestCase): + def build_graph_with_sub_graph(self): + def linear_fc(num): + data = fluid.layers.data( + name='image', shape=[1, 32, 32], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + hidden = data + for _ in six.moves.xrange(num): + hidden = fluid.layers.fc(hidden, size=128, act='relu') + loss = fluid.layers.cross_entropy(input=hidden, label=label) + loss = fluid.layers.mean(loss) + return loss + + main_program = Program() + startup_program = Program() + + def true_func(): + return linear_fc(3) + + def false_func(): + return linear_fc(5) + + with program_guard(main_program, startup_program): + x = layers.fill_constant(shape=[1], dtype='float32', value=0.1) + y = layers.fill_constant(shape=[1], dtype='float32', value=0.23) + pred = layers.less_than(y, x) + out = layers.cond(pred, true_func, false_func) + + core_graph = core.Graph(main_program.desc) + # We should create graph for test, otherwise it will throw a + # error that it cannot find the node of "STEP_COUNTER" + graph = IrGraph(core_graph, for_test=True) + sub_graph = graph.get_sub_graph(0) + all_sub_graphs = graph.all_sub_graphs( + for_test=True) # same reason for subgraph + # Should return graph and sub_graphs at the same time. If only return sub_graph, the graph will + # be destructed and the sub_graphs will be empty. 
+ return graph, all_sub_graphs + + def test_quant_sub_graphs(self, use_cuda=False): + graph, sub_graphs = self.build_graph_with_sub_graph() + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + transform_pass = QuantizationTransformPass( + scope=fluid.global_scope(), + place=place, + activation_quantize_type='abs_max', + weight_quantize_type='range_abs_max') + Find_inserted_quant_op = False + for sub_graph in sub_graphs: + transform_pass.apply(sub_graph) + for op in sub_graph.all_op_nodes(): + if 'quantize' in op.name(): + Find_inserted_quant_op = True + self.assertTrue(Find_inserted_quant_op) + + def test_quant_sub_graphs_cpu(self): + self.test_quant_sub_graphs(use_cuda=False) + + @OpTestTool.skip_if(not paddle.is_compiled_with_cuda(), + "Not GPU version paddle") + def test_quant_sub_graphs_gpu(self): + self.test_quant_sub_graphs(use_cuda=True) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py index 3d5a013915833..cd9987b3c8e82 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py @@ -14,6 +14,8 @@ from __future__ import print_function +import six +import abc import unittest import numpy as np from scipy.special import expit, erf @@ -24,15 +26,19 @@ @OpTestTool.skip_if_not_cpu_bf16() -class TestMKLDNNSigmoidBF16Op(TestActivation): +@six.add_metaclass(abc.ABCMeta) +class MKLDNNBF16ActivationOp(object): + @abc.abstractmethod def config(self): - self.op_type = "sigmoid" + pass + @abc.abstractmethod def op_forward(self, x): - return 1 / (1 + np.exp(-x)) + pass + @abc.abstractmethod def op_grad(self, dout, x): - return dout * self.op_forward(x) * (1 - self.op_forward(x)) + pass def set_attrs(self): self.attrs = {"use_mkldnn": True} @@ -65,7 +71,18 @@ def test_check_grad(self): user_defined_grad_outputs=[convert_float_to_uint16(self.out)]) -class TestMKLDNNGeluErfBF16Op(TestMKLDNNSigmoidBF16Op): +class TestMKLDNNSigmoidBF16Op(MKLDNNBF16ActivationOp, TestActivation): + def config(self): + self.op_type = "sigmoid" + + def op_forward(self, x): + return 1 / (1 + np.exp(-x)) + + def op_grad(self, dout, x): + return dout * self.op_forward(x) * (1 - self.op_forward(x)) + + +class TestMKLDNNGeluErfBF16Op(MKLDNNBF16ActivationOp, TestActivation): def config(self): self.op_type = "gelu" @@ -83,7 +100,7 @@ def init_data(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) -class TestMKLDNNGeluTanhBF16Op(TestMKLDNNSigmoidBF16Op): +class TestMKLDNNGeluTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation): def config(self): self.op_type = "gelu" @@ -104,3 +121,18 @@ def set_attrs(self): class TestMKLDNNGeluTanhDim2BF16Op(TestMKLDNNGeluTanhBF16Op): def init_data(self): self.x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + + +class TestMKLDNNReluBF16Op(MKLDNNBF16ActivationOp, TestActivation): + def config(self): + self.op_type = "relu" + + def op_forward(self, x): + return np.maximum(x, 0) + + def op_grad(self, dout, x): + return dout + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py index 2b7b2b36afa4f..e53afaa57be1c 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py +++ 
b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py @@ -40,13 +40,28 @@ def setUp(self): 'mkldnn_data_type': self.mkldnn_data_type } + self.sections = [self.x0.shape[self.axis]] * 2 + self.sections[1] += self.x1.shape[self.axis] + self.output = np.concatenate( (self.x0, self.x1, self.x2), axis=self.axis).astype(np.uint16) self.outputs = {'Out': self.output} + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dxs = np.split(self.dout, self.sections, self.axis) + def test_check_output(self): self.check_output_with_place(core.CPUPlace()) + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["x0", "x1", "x2"], + "Out", + user_defined_grads=[self.dxs[0], self.dxs[1], self.dxs[2]], + user_defined_grad_outputs=[self.dout]) + # --------------------test concat bf16 in with axis 0-------------------- def init_test_data(self): @@ -61,9 +76,9 @@ def init_axis(self): self.axis = 0 def init_shape(self): - self.x0_shape = [2, 2, 1, 2] - self.x1_shape = [1, 2, 1, 2] - self.x2_shape = [3, 2, 1, 2] + self.x0_shape = [6, 2, 4, 3] + self.x1_shape = [7, 2, 4, 3] + self.x2_shape = [8, 2, 4, 3] # --------------------test concat bf16 in with axis 1-------------------- @@ -74,9 +89,9 @@ def init_axis(self): self.axis = 1 def init_shape(self): - self.x0_shape = [1, 1, 5, 5] - self.x1_shape = [1, 2, 5, 5] - self.x2_shape = [1, 3, 5, 5] + self.x0_shape = [1, 4, 5, 5] + self.x1_shape = [1, 8, 5, 5] + self.x2_shape = [1, 6, 5, 5] # --------------------test concat bf16 in with axis 2-------------------- diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py index 4900b42d3618d..7fc8f1d30802c 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_mkldnn_op.py @@ -15,78 +15,90 @@ from __future__ import print_function import unittest -from paddle.fluid.tests.unittests.test_concat_op import TestConcatOp, TestConcatOp2, TestConcatOp3, TestConcatOp4 +import numpy as np +import struct - -class TestMKLDNNConcatOp(TestConcatOp): - def setUp(self): - super(TestMKLDNNConcatOp, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) - - def test_check_grad(self): - pass - - def init_kernel_type(self): - self.use_mkldnn = True +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle import enable_static -class TestMKLDNNConcatOp2(TestConcatOp2): +class TestConcatAxis0OneDNNOp(OpTest): def setUp(self): - super(TestMKLDNNConcatOp2, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True + self.op_type = "concat" + self.mkldnn_data_type = "float32" + self.init_axis() + self.init_shape() + self.init_test_data() + self.configure_datatype() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = { + 'axis': self.axis, + 'use_mkldnn': True, + 'mkldnn_data_type': self.mkldnn_data_type + } + + self.output = np.concatenate( + (self.x0, self.x1, self.x2), axis=self.axis).astype(self.dtype) + + self.outputs = {'Out': self.output} + + def configure_datatype(self): + self.mkldnn_data_type = "float32" + self.dtype = np.float32 def test_check_output(self): - # TODO(wangzhongpu): support 
mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) + self.check_output_with_place(core.CPUPlace()) def test_check_grad(self): - pass + self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') - def init_kernel_type(self): - self.use_mkldnn = True + def init_test_data(self): + self.x0 = np.random.random(self.x0_shape).astype(np.float32) + self.x1 = np.random.random(self.x1_shape).astype(np.float32) + self.x2 = np.random.random(self.x2_shape).astype(np.float32) + def init_axis(self): + self.axis = 0 -class TestMKLDNNConcatOp3(TestConcatOp3): - def setUp(self): - super(TestMKLDNNConcatOp3, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True + def init_shape(self): + self.x0_shape = [2, 2, 1, 50] + self.x1_shape = [1, 2, 1, 50] + self.x2_shape = [3, 2, 1, 50] - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) - def test_check_grad(self): - pass +class TestConcatAxis1OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 1 - def init_kernel_type(self): - self.use_mkldnn = True + def init_shape(self): + self.x0_shape = [1, 1, 5, 50] + self.x1_shape = [1, 2, 5, 50] + self.x2_shape = [1, 3, 5, 50] -class TestMKLDNNConcatOp4(TestConcatOp4): - def setUp(self): - super(TestMKLDNNConcatOp4, self).setUp() - self.attrs["use_mkldnn"] = True - self._cpu_only = True +class TestConcatAxis2OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 2 - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.check_output(check_dygraph=(self.attrs["use_mkldnn"] == False)) + def init_shape(self): + self.x0_shape = [2, 3, 4, 50] + self.x1_shape = [2, 3, 5, 50] + self.x2_shape = [2, 3, 6, 50] - def test_check_grad(self): - pass - def init_kernel_type(self): - self.use_mkldnn = True +class TestConcatAxis3OneDNNOp(TestConcatAxis0OneDNNOp): + def init_axis(self): + self.axis = 3 + + def init_shape(self): + self.x0_shape = [5, 3, 5, 5] + self.x1_shape = [5, 3, 5, 6] + self.x2_shape = [5, 3, 5, 7] if __name__ == '__main__': - from paddle import enable_static enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py new file mode 100644 index 0000000000000..c01f244004eff --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_flatten_mkldnn_op.py @@ -0,0 +1,151 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core + +from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16 + + +@OpTestTool.skip_if_not_cpu_bf16() +class TestFlattenOneDNNOp(OpTest): + def setUp(self): + self.set_op_type() + self.init_test_case() + self.set_inputs() + self.attrs = {"axis": self.axis, 'use_mkldnn': True} + self.ori_shape = self.inputs['X'].shape + self.outputs = {"Out": self.inputs["X"].copy().reshape(self.new_shape)} + + def set_inputs(self): + self.inputs = {"X": np.random.random(self.in_shape).astype("float32")} + + def set_op_type(self): + self.op_type = "flatten" + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 1 + self.new_shape = (3, 40) + + +class TestFlattenOneDNNOp1(TestFlattenOneDNNOp): + def init_test_case(self): + self.in_shape = (3, 2, 2, 10) + self.axis = 0 + self.new_shape = (1, 120) + + +class TestFlattenOneDNNOpSixDims(TestFlattenOneDNNOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +class TestFlatten2OneDNNOp(TestFlattenOneDNNOp): + def set_op_type(self): + self.op_type = "flatten2" + + +class TestFlatten2OneDNNOp1(TestFlattenOneDNNOp1): + def set_op_type(self): + self.op_type = "flatten2" + + +class TestFlatten2OneDNNOpSixDims(TestFlattenOneDNNOpSixDims): + def set_op_type(self): + self.op_type = "flatten2" + + +# BF16 TESTS +def create_flatten_bf16_test_classes(parent): + class TestFlatten2BF16OneDNNOp(parent): + def set_inputs(self): + self.dtype = np.uint16 + self.inputs = { + "X": np.random.random(self.in_shape).astype("uint16") + } + + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dx = np.reshape(self.dout, self.ori_shape) + + def test_check_output(self): + self.check_output_with_place( + core.CPUPlace(), no_check_set=["XShape"]) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[self.dx], + user_defined_grad_outputs=[self.dout]) + + cls_name = "{0}_{1}".format(parent.__name__, "Flatten2_BF16") + TestFlatten2BF16OneDNNOp.__name__ = cls_name + globals()[cls_name] = TestFlatten2BF16OneDNNOp + + class TestFlattenBF16OneDNNOp(parent): + def set_op_type(self): + self.dtype = np.uint16 + self.op_type = "flatten" + + def set_inputs(self): + self.dtype = np.uint16 + self.inputs = { + "X": np.random.random(self.in_shape).astype("uint16") + } + + def set_outputs(self): + self.outputs = {"Out": self.x.reshape(self.new_shape)} + + def calculate_grads(self): + self.dout = self.outputs['Out'] + self.dx = np.reshape(self.dout, self.ori_shape) + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + self.calculate_grads() + self.check_grad_with_place( + core.CPUPlace(), ["X"], + "Out", + user_defined_grads=[self.dx], + user_defined_grad_outputs=[convert_float_to_uint16(self.dout)]) + + cls_name = "{0}_{1}".format(parent.__name__, "Flatten_BF16") + TestFlattenBF16OneDNNOp.__name__ = cls_name + globals()[cls_name] = TestFlattenBF16OneDNNOp + + +create_flatten_bf16_test_classes(TestFlatten2OneDNNOp) +create_flatten_bf16_test_classes(TestFlatten2OneDNNOp1) +create_flatten_bf16_test_classes(TestFlatten2OneDNNOpSixDims) + 
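The create_flatten_bf16_test_classes factory above works because unittest only collects TestCase subclasses that are bound to module-level names: each generated subclass is given a unique __name__ and written into globals(), otherwise successive factory calls would leave no discoverable class (the inner class is local to the factory) and the variants would shadow one another in reports. A minimal standalone sketch of that registration pattern, with no Paddle dependencies and purely illustrative names (BaseShapeCase, create_variant_class are assumptions, not part of the patch):

import unittest


class BaseShapeCase(unittest.TestCase):
    """Hypothetical parent case; in the patch the parents are the OneDNN flatten tests."""

    def init_shape(self):
        self.shape = (2, 3)

    def test_rank(self):
        self.init_shape()
        self.assertGreaterEqual(len(self.shape), 2)


def create_variant_class(parent):
    # Build a subclass of `parent` that tweaks one piece of the setup.
    class VariantCase(parent):
        def init_shape(self):
            super(VariantCase, self).init_shape()
            self.shape = tuple(self.shape) + (4, )

    # unittest discovers tests by scanning module-level names, so the
    # generated class needs a unique __name__ and an entry in globals();
    # without this step the class defined inside the factory would never
    # be collected, and repeated calls would not produce distinct cases.
    cls_name = "{0}_{1}".format(parent.__name__, "Variant")
    VariantCase.__name__ = cls_name
    globals()[cls_name] = VariantCase


create_variant_class(BaseShapeCase)

if __name__ == "__main__":
    unittest.main()

The same registration pattern appears again below in create_bf16_test_class (matmul_v2 MKLDNN tests) and create_test_fp16_class (matmul_v2 NPU tests).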
+if __name__ == "__main__": + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py index 5cc6651bb0ec8..994d78126bda5 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_matmul_v2_mkldnn_op.py @@ -235,6 +235,22 @@ def config(self): self.trans_y = True +class TestMatMulV2MatrixXMatrix6Dx2DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (1, 1, 2, 1, 8, 9) + self.y_shape = (9, 12) + self.trans_x = False + self.trans_y = False + + +class TestMatMulV2MatrixXMatrix2Dx5DOneDNNOp(TestMatMulV2VectorXVectorOneDNNOp): + def config(self): + self.x_shape = (20, 5) + self.y_shape = (1, 2, 1, 5, 11) + self.trans_x = False + self.trans_y = False + + # BF16 TESTS def create_bf16_test_class(parent): @OpTestTool.skip_if_not_cpu_bf16() @@ -274,7 +290,8 @@ def calculate_grads(self): 2: [1, 0], 3: [0, 2, 1], 4: [0, 1, 3, 2], - 5: [0, 1, 2, 4, 3] + 5: [0, 1, 2, 4, 3], + 6: [0, 1, 2, 3, 5, 4] } # expand vector so it will be a valid matrix for multiplication @@ -370,6 +387,8 @@ def calculate_grads(self): create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeXTransposeYOneDNNOp) create_bf16_test_class(TestMatMulV2MatrixXMatrixTransposeY2OneDNNOp) create_bf16_test_class(TestMatMulV2MatrixXMatrix5DTranposeYOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix6Dx2DOneDNNOp) +create_bf16_test_class(TestMatMulV2MatrixXMatrix2Dx5DOneDNNOp) if __name__ == "__main__": paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt index 44b3c6764a7cf..4e81bb9544ceb 100644 --- a/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/npu/CMakeLists.txt @@ -20,4 +20,5 @@ if (WITH_ASCEND_CL) set_tests_properties(test_stack_op_npu PROPERTIES TIMEOUT 300) set_tests_properties(test_conv2d_transpose_op_npu PROPERTIES TIMEOUT 200) set_tests_properties(test_conv2d_op_npu PROPERTIES TIMEOUT 300) + set_tests_properties(test_matmulv2_op_npu PROPERTIES TIMEOUT 300) endif() diff --git a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py index 1b8b13a0d27ea..877f9904f3407 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_batch_norm_op_npu.py @@ -45,6 +45,14 @@ def check_with_place(self, place, data_layout, dtype, shape): if len(shape) == 2: x_shape = shape c = x_shape[1] + if len(shape) == 3: + n, l, c = shape[0], shape[1], shape[2] + if data_layout == "NHWC": # NLC + x_shape = [n, l, c] + elif data_layout == "NCHW": # NCL + x_shape = [n, c, l] + else: + raise ValueError("Unknown data layout.") else: n, h, w, c = shape[0], shape[1], shape[2], shape[3] if data_layout == "NHWC": @@ -117,6 +125,7 @@ def test_check_output(self): place = core.NPUPlace(0) for data_format in self.data_formats: self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) + self.check_with_place(place, data_format, self.dtype, [3, 8, 5]) def init_kernel_type(self): pass @@ -185,10 +194,19 @@ def test_with_place(place, data_layout, shape): # attr epsilon = self.epsilon momentum = self.momentum - if data_layout == "NCHW": - n, c, h, w = shape[0], shape[1], shape[2], shape[3] + + if len(shape) == 3: + if 
data_layout == "NHWC": # NLC + n, l, c = shape[0], shape[1], shape[2] + elif data_layout == "NCHW": # NCL + n, c, l = shape[0], shape[1], shape[2] + else: + raise ValueError("Unknown data layout.") else: - n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_layout == "NCHW": + n, c, h, w = shape[0], shape[1], shape[2], shape[3] + else: + n, h, w, c = shape[0], shape[1], shape[2], shape[3] scale_shape = [c] np.random.seed(123) @@ -296,6 +314,7 @@ def test_with_place(place, data_layout, shape): for data_format in self.data_formats: test_with_place(core.NPUPlace(0), data_format, [2, 3, 4, 5]) + test_with_place(core.NPUPlace(0), data_format, [3, 8, 5]) def init_kernel_type(self): pass @@ -328,6 +347,17 @@ def init_test_case(self): ] def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) y_grad = np.transpose(y_grad, (0, 2, 3, 1)) @@ -343,6 +373,9 @@ def reference_grad(self, x, y_grad, scale, mean, var, epsilon, data_format): x = np.transpose(x, (0, 3, 1, 2)) y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + return x_grad, grad_scale, grad_offset def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, @@ -350,6 +383,17 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, if data_layout != "NCHW" and data_layout != "NHWC": raise ValueError("Unknown data order.") + x_shape = x.shape + if len(x_shape) == 3: + if data_layout == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, + (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_layout == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) @@ -369,6 +413,10 @@ def ref_forward_backward(self, x, y_grad, scale, bias, mean, variance, x_grad, scale_grad, bias_grad = self.reference_grad( x, y_grad, scale, mean, variance, epsilon, data_layout) + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + x_grad = np.reshape(x_grad, x_shape) + return y, mean_out, variance_out, mean, saved_variance, x_grad, scale_grad, bias_grad diff --git a/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py new file mode 100644 index 0000000000000..4d4d61ace841e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_box_coder_op_npu.py @@ -0,0 +1,252 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +import math +import paddle +from op_test import OpTest + +paddle.enable_static() + +np.random.seed(2021) + + +def box_decoder(t_box, p_box, pb_v, output_box, norm, axis=0): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + + shape = (1, p_box.shape[0]) if axis == 0 else (p_box.shape[0], 1) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + var_shape = (1, pb_v.shape[0], pb_v.shape[1]) if axis == 0 else ( + pb_v.shape[0], 1, pb_v.shape[1]) + pb_v = pb_v.reshape(var_shape) + if pb_v.ndim == 1: + tb_x = pb_v[0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[3] * t_box[:, :, 3]) * pb_h + else: + tb_x = pb_v[:, :, 0] * t_box[:, :, 0] * pb_w + pb_x + tb_y = pb_v[:, :, 1] * t_box[:, :, 1] * pb_h + pb_y + tb_w = np.exp(pb_v[:, :, 2] * t_box[:, :, 2]) * pb_w + tb_h = np.exp(pb_v[:, :, 3] * t_box[:, :, 3]) * pb_h + output_box[:, :, 0] = tb_x - tb_w / 2 + output_box[:, :, 1] = tb_y - tb_h / 2 + output_box[:, :, 2] = tb_x + tb_w / 2 - (not norm) + output_box[:, :, 3] = tb_y + tb_h / 2 - (not norm) + + +def box_encoder(t_box, p_box, pb_v, output_box, norm): + pb_w = p_box[:, 2] - p_box[:, 0] + (norm == False) + pb_h = p_box[:, 3] - p_box[:, 1] + (norm == False) + pb_x = pb_w * 0.5 + p_box[:, 0] + pb_y = pb_h * 0.5 + p_box[:, 1] + shape = (1, p_box.shape[0]) + + pb_w = pb_w.reshape(shape) + pb_h = pb_h.reshape(shape) + pb_x = pb_x.reshape(shape) + pb_y = pb_y.reshape(shape) + + if pb_v.ndim == 2: + pb_v = pb_v.reshape(1, pb_v.shape[0], pb_v.shape[1]) + tb_x = ((t_box[:, 2] + t_box[:, 0]) / 2).reshape(t_box.shape[0], 1) + tb_y = ((t_box[:, 3] + t_box[:, 1]) / 2).reshape(t_box.shape[0], 1) + tb_w = (t_box[:, 2] - t_box[:, 0]).reshape(t_box.shape[0], 1) + (not norm) + tb_h = (t_box[:, 3] - t_box[:, 1]).reshape(t_box.shape[0], 1) + (not norm) + if pb_v.ndim == 1: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[3] + else: + output_box[:, :, 0] = (tb_x - pb_x) / pb_w / pb_v[:, :, 0] + output_box[:, :, 1] = (tb_y - pb_y) / pb_h / pb_v[:, :, 1] + output_box[:, :, 2] = np.log(np.fabs(tb_w / pb_w)) / pb_v[:, :, 2] + output_box[:, :, 3] = np.log(np.fabs(tb_h / pb_h)) / pb_v[:, :, 3] + + +def batch_box_coder(p_box, pb_v, t_box, lod, code_type, norm, axis=0): + n = t_box.shape[0] + m = p_box.shape[0] + if code_type == "decode_center_size": + m = t_box.shape[1] + output_box = np.zeros((n, m, 4), dtype=np.float32) + cur_offset = 0 + + for i in range(len(lod)): + if (code_type == "encode_center_size"): + box_encoder(t_box[cur_offset:(cur_offset + lod[i]), :], p_box, pb_v, + output_box[cur_offset:(cur_offset + lod[i]), :, :], + norm) + elif (code_type == "decode_center_size"): + box_decoder(t_box, p_box, pb_v, output_box, norm, axis) + cur_offset += lod[i] + return output_box + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestBoxCoderOp(OpTest): + def 
setUp(self): + self.op_type = "box_coder" + self.set_npu() + self.init_dtype() + + self.set_init_config() + self.set_inputs() + self.set_attrs() + self.set_outputs() + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_init_config(self): + self.M = 81 + self.N = 20 + self.code_type = 'decode_center_size' + self.box_normalized = False + self.lod = [[1, 1, 1, 1, 1]] + self.axis = 0 + self.use_variance = False + self.without_prior_box_var = False + self.atol = 1e-5 + + def set_inputs(self): + self.inputs = {} + assert (self.code_type in ['decode_center_size', 'encode_center_size']) + assert (self.axis in [0, 1]) + if self.code_type == 'decode_center_size': + assert (not self.use_variance or not self.without_prior_box_var) + + self.prior_box = np.random.random((self.M, 4)).astype(self.dtype) + + if self.use_variance: + self.prior_box_var = np.random.random(4).astype(self.dtype) + else: + if self.without_prior_box_var: + self.prior_box_var = np.ones((self.M, 4)).astype(self.dtype) + else: + self.prior_box_var = np.random.random( + (self.M, 4)).astype(self.dtype) + + if self.axis == 0: + self.target_box = np.random.random( + (self.N, self.M, 4)).astype(self.dtype) + else: + self.target_box = np.random.random( + (self.M, self.N, 4)).astype(self.dtype) + self.inputs['PriorBox'] = self.prior_box + self.inputs['TargetBox'] = self.target_box + if (not self.use_variance and not self.without_prior_box_var): + self.inputs['PriorBoxVar'] = self.prior_box_var + else: + #encode_center_size + self.prior_box = np.random.random((self.M, 4)).astype(self.dtype) + if self.use_variance: + self.prior_box_var = np.random.random(4).astype(self.dtype) + else: + self.prior_box_var = np.random.random( + (self.M, 4)).astype(self.dtype) + self.target_box = np.random.random((self.N, 4)).astype(self.dtype) + self.inputs['PriorBox'] = self.prior_box + #self.inputs['PriorBoxVar'] = self.prior_box_var + self.inputs['TargetBox'] = (self.target_box, self.lod) + if (not self.use_variance): + self.inputs['PriorBoxVar'] = self.prior_box_var + + def set_attrs(self): + self.attrs = { + 'code_type': self.code_type, + 'box_normalized': self.box_normalized + } + if self.use_variance: + self.attrs['variance'] = self.prior_box_var.astype( + np.float).flatten() + if self.axis != 0: + self.attrs['axis'] = self.axis + + def set_outputs(self): + output_box = batch_box_coder( + self.prior_box, self.prior_box_var, self.target_box, self.lod[0], + self.code_type, self.box_normalized, self.axis) + self.outputs = {'OutputBox': output_box.astype(self.dtype)} + + def test_check_output(self): + self.check_output_with_place(self.place, atol=self.atol) + + +class TestBoxCoderOpWithoutBoxVar(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithoutBoxVar, self).set_init_config() + self.without_prior_box_var = True + self.lod = [[0, 1, 2, 3, 4, 5]] + + +class TestBoxCoderOpWithLoD(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithLoD, self).set_init_config() + self.M = 20 + self.N = 50 + self.lod = [[10, 20, 20]] + self.code_type = 'encode_center_size' + self.box_normalized = True + + +class TestBoxCoderOpWithLoDWithVariance(TestBoxCoderOpWithLoD): + def set_init_config(self): + super(TestBoxCoderOpWithLoDWithVariance, self).set_init_config() + self.use_variance = True + + +class TestBoxCoderOpWithAxis(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithAxis, self).set_init_config() + self.axis = 
1 + + +class TestBoxCoderOpWithVariance(TestBoxCoderOp): + def set_init_config(self): + super(TestBoxCoderOpWithVariance, self).set_init_config() + self.use_variance = True + + +class TestBoxCoderOpFP16(TestBoxCoderOp): + def init_dtype(self): + self.dtype = np.float16 + + def set_init_config(self): + super(TestBoxCoderOpFP16, self).set_init_config() + self.atol = 1e-2 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py index ea94661e8a51e..92bbc9f536d13 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_mul_op_npu.py @@ -18,147 +18,203 @@ import unittest import sys sys.path.append("..") -from op_test import OpTest +from op_test import OpTest, skip_check_grad_ci import paddle import paddle.fluid as fluid paddle.enable_static() -SEED = 2021 -class TestElementwiseMul(OpTest): +class ElementwiseMulOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def setUp(self): self.set_npu() self.op_type = "elementwise_mul" - self.place = paddle.NPUPlace(0) - + self.dtype = np.float32 + self.axis = -1 self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - y = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) - out = np.multiply(x, y) + self.init_input_output() + self.init_axis() self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y) } - self.attrs = {} - self.outputs = {'Out': out} + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis} - def set_npu(self): - self.__class__.use_npu = True + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') + + def test_check_grad_ingore_x(self): + self.check_grad_with_place( + self.place, ['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ingore_y(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', no_grad_set=set('Y')) + + def init_input_output(self): + self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) + self.out = np.multiply(self.x, self.y) def init_dtype(self): - self.dtype = np.float32 + pass - def test_check_output(self): - self.check_output_with_place(self.place) + def init_axis(self): + pass - # TODO(ascendrc): Mul grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseMulOp_scalar(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 3, 4).astype(np.float32), + 'Y': np.random.rand(1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} -class TestElementwiseMulFp16(OpTest): +class TestElementwiseMulOp_Vector(ElementwiseMulOp): def setUp(self): self.set_npu() self.op_type = "elementwise_mul" - self.place = paddle.NPUPlace(0) + self.inputs = { + 'X': np.random.random((100, )).astype("float32"), + 'Y': np.random.random((100, )).astype("float32") + } + self.outputs = 
{'Out': np.multiply(self.inputs['X'], self.inputs['Y'])} - self.init_dtype() - np.random.seed(SEED) - x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - y = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) - out = np.multiply(x, y) +class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x * self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(x), - 'Y': OpTest.np_dtype_to_fluid_dtype(y) + 'X': np.random.rand(2, 100, 3).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 100, 1) + } + + +class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(100).astype(np.float32) + } + + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 1, 100) } - self.attrs = {} - self.outputs = {'Out': out} - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True +class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 10, 12, 3).astype(np.float32), + 'Y': np.random.rand(10, 12).astype(np.float32) + } + + self.attrs = {'axis': 1} + self.outputs = { + 'Out': self.inputs['X'] * self.inputs['Y'].reshape(1, 10, 12, 1) + } + + +class TestElementwiseMulOp_broadcast_4(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 2, 11).astype(np.float32), + 'Y': np.random.rand(10, 1, 11).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_broadcast_5(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 4, 2, 3).astype(np.float32), + 'Y': np.random.rand(10, 4, 1, 3).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "paddle is not compiled with NPU") +class TestElementwiseMulOpFp16(ElementwiseMulOp): def init_dtype(self): self.dtype = np.float16 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) - - -class TestElementwiseMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(32, 32)).astype('float32') - b_np = np.random.random(size=(32, 32)).astype('float32') - c_np = np.random.random(size=(32, 32)).astype('float32') - d_np = np.random.random(size=(32, 32)).astype('float32') - label_np = np.random.randint(2, size=(32, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') - b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') - c = paddle.static.data(name="c", 
shape=[32, 32], dtype='float32') - d = paddle.static.data(name="d", shape=[32, 32], dtype='float32') - label = paddle.static.data( - name="label", shape=[32, 1], dtype='int64') - - e = paddle.multiply(a, b) - f = paddle.multiply(c, d) - f.stop_gradient = True - g = paddle.multiply(e, f) - - fc_1 = fluid.layers.fc(input=g, size=128) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) + +class TestElementwiseMulOp_commonuse_1(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(2, 3, 100).astype(np.float32), + 'Y': np.random.rand(1, 1, 100).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_commonuse_2(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(30, 3, 1, 5).astype(np.float32), + 'Y': np.random.rand(30, 1, 4, 1).astype(np.float32) + } + self.outputs = {'Out': self.inputs['X'] * self.inputs['Y']} + + +class TestElementwiseMulOp_xsize_lessthan_ysize(ElementwiseMulOp): + def setUp(self): + self.set_npu() + self.op_type = "elementwise_mul" + self.inputs = { + 'X': np.random.rand(10, 10).astype(np.float32), + 'Y': np.random.rand(2, 2, 10, 10).astype(np.float32) + } + + self.attrs = {'axis': 2} + + self.outputs = { + 'Out': self.inputs['X'].reshape(1, 1, 10, 10) * self.inputs['Y'] + } if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py index 6faa77b460213..7c8710fd42b36 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_sub_op_npu.py @@ -90,6 +90,11 @@ def test_check_output(self): # max_relative_error=0.006,) +class TestElementwiseSubOpInt32(TestElementwiseSubOp): + def init_dtype(self): + self.dtype = np.int32 + + class TestSubtractAPI(unittest.TestCase): def test_name(self): with paddle.static.program_guard(paddle.static.Program()): diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py index a687509e6ae9c..c3074db1aaff6 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py @@ -57,6 +57,12 @@ def init(self): self.value = -1 +class TestFillAnyLikeNPUOpInt64(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.int64 + self.value = -1 + + class 
TestFillAnyLikeNPUOpFloat32(TestFillAnyLikeNPUOp): def init(self): self.dtype = np.float32 diff --git a/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py new file mode 100644 index 0000000000000..9ab1161be36dd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_group_norm_op_npu.py @@ -0,0 +1,217 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import unittest +import numpy as np + +import sys +sys.path.append("..") + +from operator import mul +from op_test import OpTest +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + +paddle.enable_static() + + +def group_norm_naive(x, scale, bias, epsilon, groups, data_layout): + if data_layout == "NHWC": + x = np.transpose(x, (0, 3, 1, 2)) # NHWC => NCHW + N, C, H, W = x.shape + G = groups + x = x.reshape((N * G, -1)) + mean = np.mean(x, axis=1, keepdims=True) + var = np.var(x, axis=1, keepdims=True) + xnorm = (x - mean) / np.sqrt(var + epsilon) + xnorm = xnorm.reshape((N, C, H, W)) + output = xnorm * scale.reshape((-1, 1, 1)) + bias.reshape((-1, 1, 1)) + if data_layout == "NHWC": + output = np.transpose(output, (0, 2, 3, 1)) # NCHW => NHWC + xnorm = np.transpose(xnorm, (0, 2, 3, 1)) + return output, mean.reshape((N, G)), var.reshape((N, G)) + + +class TestGroupNormOpError(unittest.TestCase): + def test_errors(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + + def test_x_type(): + input = np.random.random(2, 100, 3, 5).astype('float32') + groups = 2 + fluid.layers.group_norm(input, groups) + + self.assertRaises(TypeError, test_x_type) + + def test_x_dtype(): + x2 = fluid.layers.data( + name='x2', shape=[2, 100, 3, 5], dtype='int32') + groups = 2 + fluid.layers.group_norm(x2, groups) + + self.assertRaises(TypeError, test_x_dtype) + + +class TestGroupNormOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = 'group_norm' + self.place = paddle.NPUPlace(0) + + self.init_dtype() + + self.data_format = "NCHW" + self.atol = 1e-6 + self.max_relative_error = 0.005 + self.shape = (2, 100, 3, 5) + self.attrs = {'epsilon': 1e-5, 'groups': 2, 'data_layout': "NCHW"} + self.compare_between_place = False + self.init_test_case() + + input = np.random.random(self.shape).astype(self.dtype) + if self.data_format == "NHWC": + input = np.transpose(input, (0, 2, 3, 1)) + scale = np.random.random([self.shape[1]]).astype(self.dtype) + bias = np.random.random([self.shape[1]]).astype(self.dtype) + output, mean, var = group_norm_naive( + input, scale, bias, self.attrs['epsilon'], self.attrs['groups'], + self.data_format) + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(input), + 'Scale': OpTest.np_dtype_to_fluid_dtype(scale), + 'Bias': OpTest.np_dtype_to_fluid_dtype(bias) + } + self.outputs = {'Y': output, 'Mean': mean, 'Variance': var} + self.attrs['data_layout'] = self.data_format + + def set_npu(self): + 
self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=self.atol) + + def test_check_grad(self): + if self.dtype == np.float16: + return + + self.__class__.exist_check_grad = True + inputs_to_check = ['X', 'Scale', 'Bias'] + output_names = 'Y' + no_grad_set = set() + cpu_place = fluid.CPUPlace() + cpu_grads = self._get_gradient(inputs_to_check, cpu_place, output_names, + no_grad_set) + npu_grads = self._get_gradient(inputs_to_check, self.place, + output_names, no_grad_set) + + self._assert_is_close(cpu_grads, npu_grads, inputs_to_check, + self.max_relative_error, + "Gradient Check between places") + + def init_test_case(self): + pass + + +class TestGroupNormOp1(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + + +class TestGroupNormOp2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + + +class TestGroupNormOpBigEps1(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps2(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOpBigEps3(TestGroupNormOp): + def init_test_case(self): + self.attrs['epsilon'] = 0.5 + + +class TestGroupNormOp1_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.data_format = "NHWC" + + +class TestGroupNormOp2_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps1_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 1 + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps2_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['groups'] = 4 + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpBigEps3_With_NHWC(TestGroupNormOp): + def init_test_case(self): + self.attrs['epsilon'] = 0.5 + self.data_format = "NHWC" + + +class TestGroupNormOpFP16(TestGroupNormOp): + def init_dtype(self): + self.dtype = np.float16 + + +class TestGroupNormOpFP16_With_NHWC(TestGroupNormOp): + def init_dtype(self): + self.dtype = np.float16 + + def init_test_case(self): + self.data_format = "NHWC" + + +class TestGroupNormException(unittest.TestCase): + # data_layout is not NHWC or NCHW + def test_exception(self): + data = fluid.data(name='data', shape=[None, 3, 3, 4], dtype="float64") + + def attr_data_format(): + out = fluid.layers.group_norm( + input=data, groups=2, data_layout="NDHW") + + self.assertRaises(ValueError, attr_data_format) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py index 53766c5eb61b7..882043ef6eb91 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_matmulv2_op_npu.py @@ -21,56 +21,35 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +from test_matmul_v2_op import reference_matmul paddle.enable_static() SEED = 2021 -def reference_matmul(X, Y, transpose_X=False, transpose_Y=False): - """Reference forward implementation using np.matmul.""" - # np.matmul does not support the transpose flags, so we manually - # transpose X and Y appropriately. 
- if transpose_X: - if X.ndim == 1: - X = X.reshape((X.size)) - elif X.ndim == 2: - X = X.T - else: - dim = [i for i in range(len(X.shape))] - dim[-1], dim[len(X.shape) - 2] = dim[len(X.shape) - 2], dim[-1] - X = np.transpose(X, tuple(dim)) - if transpose_Y: - if Y.ndim == 1: - Y = Y.reshape((Y.size)) - else: - dim = [i for i in range(len(Y.shape))] - dim[-1], dim[len(Y.shape) - 2] = dim[len(Y.shape) - 2], dim[-1] - Y = np.transpose(Y, tuple(dim)) - - Out = np.matmul(X, Y) - if not Out.shape: - # We do not support 0-dimensional Tensors (scalars). So where - # np.matmul outputs a scalar, we must convert to a Tensor of - # shape (1) instead. - # Everywhere else, we are compatible with np.matmul. - Out = np.array([Out], dtype="float64") - return Out - - -class TestMatMul(OpTest): +class TestMatMulV2Op(OpTest): + """ + case 1 + """ + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + def config(self): - self.x_shape = (100, 24) - self.y_shape = (24, 100) + self.x_shape = (100, ) + self.y_shape = (100, ) self.trans_x = False self.trans_y = False + def init_kernel_type(self): + self.dtype = "float32" + def setUp(self): self.set_npu() - self.op_type = "matmul_v2" - self.place = paddle.NPUPlace(0) - self.init_dtype() + self.init_kernel_type() self.config() - np.random.seed(SEED) + self.op_type = "matmul_v2" x = np.random.random(self.x_shape).astype(self.dtype) y = np.random.random(self.y_shape).astype(self.dtype) # -0.1 ~ 0.1 @@ -85,201 +64,314 @@ def setUp(self): self.attrs = {'trans_x': self.trans_x, 'trans_y': self.trans_y} self.outputs = {'Out': result} - def set_npu(self): - self.__class__.use_npu = True - self.__class__.no_need_check_grad = True - - def init_dtype(self): - self.dtype = np.float32 - def test_check_output(self): - self.check_output_with_place(self.place, atol=1e-5) + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X', 'Y'], 'Out') - # TODO(ascendrc): Add grad test - # def test_check_grad(self): - # if self.dtype == np.float16: - # return - # self.check_grad(['X'], 'Out') - # -class TestMatMul2(TestMatMul): +class TestMatMuklOp2(TestMatMulV2Op): """ case 2 """ def config(self): - self.x_shape = (32, 24) - self.y_shape = (32, 24) + self.x_shape = (100, ) + self.y_shape = (1, 3, 2, 100) self.trans_x = False self.trans_y = True -class TestMatMul3(TestMatMul): +class TestMatMuklOp3(TestMatMulV2Op): """ case 3 """ - def init_dtype(self): - self.dtype = np.float16 + def config(self): + self.x_shape = (100, ) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False -class TestMatMul4(TestMatMul): +class TestMatMuklOp4(TestMatMulV2Op): """ - case 4 dim=3 + case 4 """ def config(self): - self.x_shape = (2, 3, 4) - self.y_shape = (2, 4, 3) + self.x_shape = (100, ) + self.y_shape = (1, 2, 100, 2) self.trans_x = False self.trans_y = False -class TestMatMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(2, 3)).astype('float32') - b_np = np.random.random(size=(2, 3)).astype('float32') - c_np = np.random.random(size=(3, 2)).astype('float32') - d_np = np.random.random(size=(3, 2)).astype('float32') - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = 
paddle.static.data(name="a", shape=[2, 3], dtype='float32') - b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') - c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') - d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - result = paddle.matmul(sum_1, sum_2) - - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) - - -# The precision is aligned in NPU and GPU separately, which is only used for the usage method. - - -class TestMatMulNet3_2(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - self._dtype = "float32" - - a_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - b_np = np.random.random(size=(2, 1, 3)).astype(self._dtype) - c_np = np.random.random(size=(3, 2)).astype(self._dtype) - d_np = np.random.random(size=(3, 2)).astype(self._dtype) - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 1, 3], dtype=self._dtype) - b = paddle.static.data(name="b", shape=[2, 1, 3], dtype=self._dtype) - c = paddle.static.data(name="c", shape=[3, 2], dtype=self._dtype) - d = paddle.static.data(name="d", shape=[3, 2], dtype=self._dtype) - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - sum_1 = paddle.cast(sum_1, 'float16') - sum_2 = paddle.cast(sum_2, 'float16') - if not run_npu: - sum_1 = paddle.cast(sum_1, 'float32') - sum_2 = paddle.cast(sum_2, 'float32') - - result = paddle.matmul(sum_1, sum_2) - if run_npu: - result = paddle.cast(result, 'float32') - - result = paddle.reshape(result, shape=[2, 2]) - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: +class TestMatMuklOp5(TestMatMulV2Op): + """ + case 5 + """ + + def config(self): + self.x_shape = (1, 1, 100, 1) + self.y_shape = (100, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp6(TestMatMulV2Op): + """ + case 6 + """ + + def config(self): + self.x_shape = (1, 2, 102, 1) + self.y_shape = 
(102, ) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp7(TestMatMulV2Op): + """ + case 7 + """ + + def config(self): + self.x_shape = (1, 2, 1, 100) + self.y_shape = (100, ) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp8(TestMatMulV2Op): + """ + case 8 + """ + + def config(self): + self.x_shape = (1, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp9(TestMatMulV2Op): + """ + case 9 + """ + + def config(self): + self.x_shape = (1, 1, 1, 100) + self.y_shape = (2, 1, 2, 100) + self.trans_x = False + self.trans_y = True + + +class TestMatMuklOp10(TestMatMulV2Op): + """ + case 10 + """ + + def config(self): + self.x_shape = (1, 1, 25, 4) + self.y_shape = (1, 2, 4, 25) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp11(TestMatMulV2Op): + """ + case 11 + """ + + def config(self): + self.x_shape = (2, 1, 2, 100) + self.y_shape = (1, 1, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp12(TestMatMulV2Op): + """ + case 12 + """ + + def config(self): + self.x_shape = (2, 1, 4, 25) + self.y_shape = (1, 1, 4, 25) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp13(TestMatMulV2Op): + """ + case 13 + """ + + def config(self): + self.x_shape = (2, 2, 10, 10) + self.y_shape = (2, 2, 10, 10) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp14(TestMatMulV2Op): + """ + case 14_1 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = True + self.trans_y = False + + +class TestMatMuklOp15(TestMatMulV2Op): + """ + case 14_2 + """ + + def config(self): + self.x_shape = (3, 1, 6, 6) + self.y_shape = (1, 2, 6, 9) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp16(TestMatMulV2Op): + """ + case 16 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (100) + self.y_shape = (1, 2, 2, 100, 2) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOp17(TestMatMulV2Op): + """ + case 17 : to check the gradient for special case + """ + + def config(self): + self.x_shape = (2, 1, 100) + self.y_shape = (100) + self.trans_x = False + self.trans_y = False + + +class TestMatMuklOpBroadcast1(TestMatMulV2Op): + """ + case 14_3 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = True + self.trans_y = True + + +class TestMatMuklOpBroadcast2(TestMatMulV2Op): + """ + case 14_4 + """ + + def config(self): + self.x_shape = (3, 1, 10, 10) + self.y_shape = (1, 2, 10, 10) + self.trans_x = False + self.trans_y = True + + +#--------------------test matmul fp16-------------------- + + +def create_test_fp16_class(parent, atol=0.001, max_relative_error=2.5): + class TestMatMulOpFp16Case(parent): + def init_kernel_type(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=atol) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X', 'Y'], + 'Out', + max_relative_error=max_relative_error) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16") + TestMatMulOpFp16Case.__name__ = cls_name + globals()[cls_name] = TestMatMulOpFp16Case + + +create_test_fp16_class(TestMatMulV2Op) +create_test_fp16_class(TestMatMuklOp2) +create_test_fp16_class(TestMatMuklOp3) +create_test_fp16_class(TestMatMuklOp4) +create_test_fp16_class(TestMatMuklOp5) +create_test_fp16_class(TestMatMuklOp6) 
+create_test_fp16_class(TestMatMuklOp7) +create_test_fp16_class(TestMatMuklOp8) +create_test_fp16_class(TestMatMuklOp9) +create_test_fp16_class(TestMatMuklOp10) +create_test_fp16_class(TestMatMuklOp11) +create_test_fp16_class(TestMatMuklOp12) +create_test_fp16_class(TestMatMuklOp13) +create_test_fp16_class(TestMatMuklOp14) +create_test_fp16_class(TestMatMuklOp15) +create_test_fp16_class(TestMatMuklOp16) +create_test_fp16_class(TestMatMuklOp17) + + +class TestMatMulV2API(unittest.TestCase): + def setUp(self): + self.places = [paddle.CPUPlace()] + if paddle.is_compiled_with_npu(): + self.places.append(paddle.NPUPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input_x = fluid.data(name="input_x", shape=[4, 3], dtype="float32") + input_y = fluid.data(name="input_y", shape=[3, 4], dtype="float32") + + result = paddle.matmul(input_x, input_y) + + x_np = np.random.random([4, 3]).astype("float32") + y_np = np.random.random([3, 4]).astype("float32") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input_x": x_np, + "input_y": y_np}, + fetch_list=[result]) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + for place in self.places: + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float32") + input_y = np.random.random([3, 4]).astype("float32") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) + + def test_dygraph_fp16(self): + if paddle.is_compiled_with_npu(): place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred, atol=1e-4)) - self.assertTrue(np.allclose(npu_loss, cpu_loss, atol=1e-4)) + with fluid.dygraph.guard(place): + input_x = np.random.random([4, 3]).astype("float16") + input_y = np.random.random([3, 4]).astype("float16") + x = paddle.to_tensor(input_x) + y = paddle.to_tensor(input_y) + result = paddle.matmul(x, y) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py index e819f422f2b44..421ea1df4cff0 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_set_value_op_npu.py @@ -23,13 +23,15 @@ import paddle.fluid as fluid from paddle.fluid import core -SEED = 2021 - class TestSetValueBase(unittest.TestCase): - def set_input(self): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + paddle.enable_static() self.set_npu() - paddle.device.set_device('npu') self.set_dtype() self.set_value() self.set_shape() @@ -51,9 +53,6 @@ def _call_setitem(self, x): def _get_answer(self): self.data[0, 0] = self.value - def set_npu(self): - self.__class__.use_npu = True - class TestSetValueApi(TestSetValueBase): def 
_run_static(self): @@ -62,13 +61,13 @@ def _run_static(self): x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) - exe = paddle.static.Executor(paddle.NPUPlace(0)) + exe = paddle.static.Executor(self.place) out = exe.run(self.program, fetch_list=[x]) paddle.disable_static() return out def _run_dynamic(self): - paddle.disable_static(paddle.NPUPlace(0)) + paddle.disable_static(self.place) x = paddle.ones(shape=self.shape, dtype=self.dtype) self._call_setitem(x) out = x.numpy() @@ -76,7 +75,6 @@ def _run_dynamic(self): return out def test_api(self): - self.set_input() static_out = self._run_static() dynamic_out = self._run_dynamic() self._get_answer() @@ -134,23 +132,22 @@ def _get_answer(self): self.data[0:, 1:2, :] = self.value -""" FIXEME : it seams that NPU don't support while operator ??? -class TestSetValueItemSliceInWhile(TestSetValueApi): - def _call_setitem(self, x): - def cond(i, x): - return i < 1 +# TODO(qili93): Fix this after NPU support while_loop +# class TestSetValueItemSliceInWhile(TestSetValueApi): +# def _call_setitem(self, x): +# def cond(i, x): +# return i < 1 - def body(i, x): - x[i] = self.value - i = i + 1 - return i, x - with paddle.static.device_guard("npu"): - i = paddle.zeros(shape=(1, ), dtype='int32') - i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) +# def body(i, x): +# x[i] = self.value +# i = i + 1 +# return i, x - def _get_answer(self): - self.data[0] = self.value -""" +# i = paddle.zeros(shape=(1, ), dtype='int32') +# i, x = paddle.fluid.layers.while_loop(cond, body, [i, x]) + +# def _get_answer(self): +# self.data[0] = self.value # 1.2.2 step > 1 @@ -192,6 +189,60 @@ def _get_answer(self): self.data[0:, 1:2:2, :] = self.value +# 1.2.3 step < 0 +class TestSetValueItemSliceNegetiveStep(TestSetValueApi): + def set_shape(self): + self.shape = [5, 2] + + def set_value(self): + self.value = np.array([3, 4]) + + def _call_setitem(self, x): + x[5:2:-1] = self.value + + def _get_answer(self): + self.data[5:2:-1] = self.value + + +class TestSetValueItemSliceNegetiveStep2(TestSetValueApi): + def set_shape(self): + self.shape = [5] + + def set_value(self): + self.value = np.array([3, 4]) + + def _call_setitem(self, x): + x[1::-1] = self.value + + def _get_answer(self): + self.data[1::-1] = self.value + + +class TestSetValueItemSliceNegetiveStep3(TestSetValueApi): + def set_shape(self): + self.shape = [3] + + def set_value(self): + self.value = np.array([3, 4, 5]) + + def _call_setitem(self, x): + x[::-1] = self.value + + def _get_answer(self): + self.data[::-1] = self.value + + +class TestSetValueItemSliceNegetiveStep4(TestSetValueApi): + def set_shape(self): + self.shape = [3, 4, 5] + + def _call_setitem(self, x): + x[2:0:-1, 0:2, ::-1] = self.value + + def _get_answer(self): + self.data[2:0:-1, 0:2, ::-1] = self.value + + # 1.3 item is Ellipsis @@ -277,6 +328,19 @@ def _get_answer(self): self.data[0:, 1:2:2, :] = self.value +class TestSetValueItemTensor6(TestSetValueApi): + def set_shape(self): + self.shape = [3, 4, 5] + + def _call_setitem(self, x): + minus1 = paddle.full([1], -1, dtype="int32") + zero = paddle.full([1], 0, dtype="int32") + x[2:zero:minus1, 0:2, 10:-6:minus1] = self.value + + def _get_answer(self): + self.data[2:0:-1, 0:2, ::-1] = self.value + + # 1.5 item is None class TestSetValueItemNone1(TestSetValueApi): def _call_setitem(self, x): @@ -350,133 +414,99 @@ def _get_answer(self): self.data[None, :, 1, ..., None] = np.zeros(self.shape)[0, 0, :, None] -""" FIXME : current NPU set_value don't support negative step 
!!! - @xiongkun03 +# 1.5 item is list or Tensor of bol +class TestSetValueItemBool1(TestSetValueApi): + def _call_setitem(self, x): + x[[True, False]] = self.value -class TestSetValueItemTensor6(TestSetValueApi): - def set_shape(self): - self.shape = [3, 4, 5] + def _get_answer(self): + self.data[[True, False]] = self.value + +class TestSetValueItemBool2(TestSetValueApi): def _call_setitem(self, x): - minus1 = paddle.full([1], -1, dtype="int32") - zero = paddle.full([1], 0, dtype="int32") - x[2:zero:minus1, 0:2, 10:-6:minus1] = self.value + x[[False, False]] = self.value def _get_answer(self): - self.data[2:0:-1, 0:2, ::-1] = self.value -""" + self.data[[False, False]] = self.value -# 2. Test different type of value: int, float, numpy.ndarray, Tensor -# 2.1 value is int32, int64, float32, float64, bool +class TestSetValueItemBool3(TestSetValueApi): + def _call_setitem(self, x): + x[[False, True]] = np.zeros(self.shape[2]) -def create_test_value_int32(parent): - class TestValueInt(parent): - def set_value(self): - self.value = 7 + def _get_answer(self): + self.data[[False, True]] = np.zeros(self.shape[2]) - def set_dtype(self): - self.dtype = "int32" - - cls_name = "{0}_{1}".format(parent.__name__, "ValueInt32") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt +class TestSetValueItemBool4(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign(np.array([False, True])) + x[idx] = np.zeros(self.shape[2]) -create_test_value_int32(TestSetValueItemInt) -create_test_value_int32(TestSetValueItemSlice) -create_test_value_int32(TestSetValueItemSlice2) -create_test_value_int32(TestSetValueItemSlice3) -create_test_value_int32(TestSetValueItemSlice4) + def _get_answer(self): + self.data[np.array([False, True])] = np.zeros(self.shape[2]) -def create_test_value_numpy_fp32(parent): - class TestValueInt(parent): - def set_value(self): - self.value = np.array([1]) +class TestSetValueItemBool5(TestSetValueApi): + def _call_setitem(self, x): + idx = paddle.assign( + np.array([[False, True, False], [True, True, False]])) + x[idx] = self.value - def set_dtype(self): - self.dtype = "float32" + def _get_answer(self): + self.data[np.array([[False, True, False], [True, True, False] + ])] = self.value - cls_name = "{0}_{1}".format(parent.__name__, "ValueNumpyFp32") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt +class TestSetValueItemBool6(TestSetValueApi): + def _call_setitem(self, x): + x[0, ...] = 0 + x[x > 0] = self.value -create_test_value_numpy_fp32(TestSetValueItemInt) -create_test_value_numpy_fp32(TestSetValueItemSlice) -create_test_value_numpy_fp32(TestSetValueItemSlice2) -create_test_value_numpy_fp32(TestSetValueItemSlice3) -create_test_value_numpy_fp32(TestSetValueItemSlice4) + def _get_answer(self): + self.data[0, ...] 
= 0 + self.data[self.data > 0] = self.value -def create_test_value_numpy_fp64(parent): +def create_test_value_int32(parent): class TestValueInt(parent): def set_value(self): - self.value = np.array([2**127]).astype("float64") - - def set_dtype(self): - self.dtype = "float64" - - cls_name = "{0}_{1}".format(parent.__name__, "ValueNumpyFp64") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt - - -create_test_value_numpy_fp64(TestSetValueItemInt) -create_test_value_numpy_fp64(TestSetValueItemSlice) -create_test_value_numpy_fp64(TestSetValueItemSlice2) -create_test_value_numpy_fp64(TestSetValueItemSlice3) -create_test_value_numpy_fp64(TestSetValueItemSlice4) - + self.value = 7 -# 2.3 value is a Paddle Tensor (int32, int64, float32, float64, bool) -def create_test_value_tensor_int32(parent): - class TestValueInt(parent): def set_dtype(self): self.dtype = "int32" - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorInt32") + cls_name = "{0}_{1}".format(parent.__name__, "ValueInt32") TestValueInt.__name__ = cls_name globals()[cls_name] = TestValueInt -create_test_value_tensor_int32(TestSetValueItemInt) -create_test_value_tensor_int32(TestSetValueItemSlice) -create_test_value_tensor_int32(TestSetValueItemSlice2) -create_test_value_tensor_int32(TestSetValueItemSlice3) -create_test_value_tensor_int32(TestSetValueItemSlice4) +create_test_value_int32(TestSetValueItemInt) +create_test_value_int32(TestSetValueItemSlice) +create_test_value_int32(TestSetValueItemSlice2) +create_test_value_int32(TestSetValueItemSlice3) +create_test_value_int32(TestSetValueItemSlice4) -def create_test_value_tensor_int64(parent): +def create_test_value_int64(parent): class TestValueInt(parent): + def set_value(self): + self.value = 7 + def set_dtype(self): self.dtype = "int64" - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorInt64") + cls_name = "{0}_{1}".format(parent.__name__, "ValueInt64") TestValueInt.__name__ = cls_name globals()[cls_name] = TestValueInt -create_test_value_tensor_int64(TestSetValueItemInt) -create_test_value_tensor_int64(TestSetValueItemSlice) -create_test_value_tensor_int64(TestSetValueItemSlice2) -create_test_value_tensor_int64(TestSetValueItemSlice3) -create_test_value_tensor_int64(TestSetValueItemSlice4) +create_test_value_int64(TestSetValueItemInt) +create_test_value_int64(TestSetValueItemSlice) +create_test_value_int64(TestSetValueItemSlice2) +create_test_value_int64(TestSetValueItemSlice3) +create_test_value_int64(TestSetValueItemSlice4) def create_test_value_tensor_fp32(parent): @@ -503,30 +533,6 @@ def _get_answer(self): create_test_value_tensor_fp32(TestSetValueItemSlice4) -def create_test_value_tensor_fp64(parent): - class TestValueInt(parent): - def set_dtype(self): - self.dtype = "float64" - - def _call_setitem(self, x): - value = paddle.full(shape=[1], fill_value=3, dtype=self.dtype) - x[0, 1] = value - - def _get_answer(self): - self.data[0, 1] = 3 - - cls_name = "{0}_{1}".format(parent.__name__, "ValueTensorFp64") - TestValueInt.__name__ = cls_name - globals()[cls_name] = TestValueInt - - -create_test_value_tensor_fp64(TestSetValueItemInt) -create_test_value_tensor_fp64(TestSetValueItemSlice) 
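Each TestSetValue* case above pairs _call_setitem (the paddle assignment under test) with _get_answer (the same assignment performed on self.data with NumPy). A NumPy-only sketch of the reference side for two of the newly added kinds of indexing, negative-step slices and boolean masks (the shapes below are illustrative):

import numpy as np

# Negative-step slice, cf. TestSetValueItemSliceNegetiveStep4 / TestSetValueItemTensor6:
# assignment through a reversed strided view.
data = np.ones((3, 4, 5), dtype="float32")
data[2:0:-1, 0:2, ::-1] = 6.0

# Boolean mask over the leading axis, cf. TestSetValueItemBool3:
# the value broadcasts into the selected rows.
data2 = np.ones((2, 3, 4), dtype="float32")
data2[[False, True]] = np.zeros(4, dtype="float32")

assert data[2, 0, 0] == 6.0 and data2[1].sum() == 0.0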
-create_test_value_tensor_fp64(TestSetValueItemSlice2) -create_test_value_tensor_fp64(TestSetValueItemSlice3) -create_test_value_tensor_fp64(TestSetValueItemSlice4) - - # 3. Test different shape of value class TestSetValueValueShape1(TestSetValueApi): def set_value(self): @@ -589,59 +595,5 @@ def _get_answer(self): self.data[:, 0] = self.value -# 4. Test error -class TestError(TestSetValueBase): - def _value_type_error(self): - with self.assertRaisesRegexp( - TypeError, - "Only support to assign an integer, float, numpy.ndarray or paddle.Tensor" - ): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - value = [1] - x[0] = value - - def _dtype_error(self): - with self.assertRaisesRegexp( - TypeError, - "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " - ): - y = paddle.ones(shape=self.shape, dtype="float16") - y[0] = 1 - - def _step_error(self): - with self.assertRaisesRegexp(ValueError, "step can not be 0"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - x[0:1:0] = self.value - - def _ellipsis_error(self): - with self.assertRaisesRegexp( - IndexError, "An index can only have a single ellipsis"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - x[..., ...] = self.value - with self.assertRaisesRegexp(ValueError, "the start or end is None"): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - one = paddle.ones([1]) - x[::one] = self.value - - def _broadcast_mismatch(self): - program = paddle.static.Program() - with paddle.static.program_guard(program): - x = paddle.ones(shape=self.shape, dtype=self.dtype) - value = np.array([3, 4, 5, 6, 7]) - x[0] = value - exe = paddle.static.Executor(paddle.CPUPlace()) - with self.assertRaises(ValueError): - exe.run(program) - - def test_error(self): - self.set_input() - paddle.enable_static() - with paddle.static.program_guard(self.program): - self._value_type_error() - self._dtype_error() - self._step_error() - self._broadcast_mismatch() - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py index 5a38f14868bb8..611691109e187 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_slice_op_npu.py @@ -301,5 +301,295 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +class TestSliceOpDecsDim(OpTest): + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.set_inputs() + self.set_outputs() + self.set_attrs() + + def set_inputs(self): + self.inputs = {'Input': self.input} + + def set_outputs(self): + self.outputs = {'Out': self.out} + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + def init_dtype(self): + self.dtype = np.float32 + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['Input'], 'Out') + + +class 
TestSliceOpDecsDimFp16(TestSliceOpDecsDim): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDim2(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOpDecsDim3(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1, 0, 2] + self.ends = [1000000, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [1, 1, 1] + self.out = self.input[-1, 0, 2:4, :] + + +class TestSliceOpDecsDim4(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 7]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOpDecsDim5(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [1, 1, 1] + self.out = self.input[:, :, :, -1] + + +class TestSliceOpDecsDim6(TestSliceOpDecsDim): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [0, 1, 2, 3] + self.ends = [1, 2, 3, 4] + self.axes = [0, 1, 2, 3] + self.decrease_axis = [0, 1, 2, 3] + self.infer_flags = [1, 1, 1] + self.out = self.input[0, 1, 2, 3:4] + + +class TestSliceOpDecsDimStartsTensor(TestSliceOpDecsDim): + def set_inputs(self): + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype='int32') + } + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0:3, 2:4, :] + + +class TestSliceOpDecsDimStartsTensorFP16(TestSliceOpDecsDimStartsTensor): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDimStartsTensorStartsAndEndsTensor(TestSliceOpDecsDim): + def set_inputs(self): + self.inputs = { + 'Input': self.input, + "StartsTensor": np.array( + self.starts, dtype='int64'), + "EndsTensor": np.array( + self.ends, dtype='int32') + } + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + #'starts': self.starts, + #'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 1, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0, 1] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1, 0, 2:4, :] + + +class TestSliceOpDecsDimStartsTensorStartsAndEndsTensorFP16( + TestSliceOpDecsDimStartsTensorStartsAndEndsTensor): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpDecsDimStartsListTensor(TestSliceOpDecsDim): + def set_inputs(self): + starts_tensor = [] + for index, ele in enumerate(self.starts): + starts_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * 
ele)) + + self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} + + def set_attrs(self): + self.attrs = { + 'axes': self.axes, + 'starts': self.starts_infer, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'decrease_axis': self.decrease_axis, + } + + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [2, 3, 4] + self.axes = [0, 1, 2] + self.decrease_axis = [0] + self.infer_flags = [1, -1, 1] + self.out = self.input[1, 0:3, 2:4, :] + + self.starts_infer = [1, -1, 2] + + +class TestSliceOpDecsDimStartsListTensor2(TestSliceOpDecsDimStartsListTensor): + def config(self): + self.input = np.random.random([3, 4, 5, 6]).astype(self.dtype) + self.starts = [-1] + self.ends = [1000000] + self.axes = [3] + self.decrease_axis = [3] + self.infer_flags = [-1] + self.out = self.input[:, :, :, -1] + + self.starts_infer = [-1] + + +class TestSliceOpDecsDimStartsListTensorFP16( + TestSliceOpDecsDimStartsListTensor): + def init_dtype(self): + self.dtype = np.float16 + + +class TestSliceOpInt64(OpTest): + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.randint( + 100, size=(3, 4, 5, 6)).astype(self.dtype) + self.starts = [1, 0, 2] + self.ends = [3, 3, 4] + self.axes = [0, 1, 2] + self.infer_flags = [1, 1, 1] + self.out = self.input[1:3, 0:3, 2:4, :] + + def init_dtype(self): + self.dtype = np.int64 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestSliceOpTensorInt64(TestSliceOpInt64): + def setUp(self): + self.op_type = "slice" + self.set_npu() + self.init_dtype() + self.config() + self.inputs = { + 'Input': self.input, + 'StartsTensor': self.starts, + 'EndsTensor': self.ends + } + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': [-1, -1, -1], + 'ends': [-1, -1, -1], + 'infer_flags': self.infer_flags + } + + def config(self): + self.input = np.random.randint( + 100, size=(3, 4, 5, 6)).astype(self.dtype) + self.starts = np.array([1, 0, 2]).astype('int32') + self.ends = np.array([3, 3, 4]).astype('int32') + self.axes = [0, 1, 2] + self.infer_flags = [-1, -1, -1] + self.out = self.input[1:3, 0:3, 2:4, :] + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py index b735adf76d6c1..c8a620d9dbb35 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_op_npu.py @@ -22,6 +22,7 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core +from test_top_k_v2_op_npu import numpy_topk paddle.enable_static() SEED = 2021 @@ -87,5 +88,40 @@ def test_check_output(self): self.check_output_with_place(self.place) +class TestTopkV3(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "top_k" + + self.init_dtype() + self.set_input_data() + self.set_attrs() + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=True) + + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis} 
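TestTopkV3 above derives its expected Out/Indices from the numpy_topk helper imported from test_top_k_v2_op_npu. A self-contained sketch of what such a reference computes (this re-implementation is illustrative, not the imported helper itself); note the test draws unique input values with replace=False, which avoids tie-breaking ambiguity in the indices:

import numpy as np

def numpy_topk_ref(x, k=1, axis=-1, largest=True):
    # Argsort along `axis` (descending when largest=True), keep the first k
    # positions, and gather the corresponding values.
    order = np.argsort(-x, axis=axis) if largest else np.argsort(x, axis=axis)
    indices = np.take(order, np.arange(k), axis=axis)
    values = np.take_along_axis(x, indices, axis=axis)
    return values, indices

x = np.random.choice(10000, size=(10, 20), replace=False).astype("float16")
values, indices = numpy_topk_ref(x, k=3, axis=1, largest=True)
assert values.shape == (10, 3) and indices.shape == (10, 3)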
+ self.outputs = {'Out': output, 'Indices': indices} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_attrs(self): + self.k = 3 + self.axis = 1 + + def set_input_data(self): + self.input_data = np.random.choice( + 10000, size=(10, 20), replace=False).astype(self.dtype) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py index e95f3cc83cfb3..b1a6bfcdaaadc 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_transpose_op_npu.py @@ -31,40 +31,104 @@ def setUp(self): self.op_type = "transpose2" self.place = paddle.NPUPlace(0) self.init_dtype() - self.init_input_output() - self.init_kernel_type() - self.init_axis() + self.init_shape_axis() - self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)} - self.attrs = {'axis': [0, 2, 1, 3], 'data_format': 'AnyLayout'} - self.outputs = {'Out': self.out} + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.attrs = {'axis': self.axis, 'data_format': 'AnyLayout'} + self.outputs = {'Out': self.inputs['X'].transpose(self.axis)} def set_npu(self): self.__class__.use_npu = True - def init_kernel_type(self): - self.use_mkldnn = False - - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [8, 512, 12, 64]).astype(self.dtype) - self.out = np.transpose(self.x, [0, 2, 1, 3]) - def init_dtype(self): self.dtype = np.float32 - def init_axis(self): - self.axis = -1 + def init_shape_axis(self): + self.shape = (3, 40) + self.axis = (1, 0) def test_check_output(self): self.check_output_with_place(self.place) + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestCase0(TestTransposeOp): + def init_shape_axis(self): + self.shape = (100, ) + self.axis = (0, ) + + +class TestCase1(TestTransposeOp): + def init_shape_axis(self): + self.shape = (3, 4, 10) + self.axis = (0, 2, 1) + + +class TestCase2(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5) + self.axis = (0, 2, 3, 1) + + +class TestCase3(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5, 6) + self.axis = (4, 2, 3, 1, 0) + + +class TestCase4(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 4, 5, 6, 1) + self.axis = (4, 2, 3, 1, 0, 5) + + +class TestCase5(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 16, 96) + self.axis = (0, 2, 1) -class TestTransposeOpFP16(TestTransposeOp): - no_need_check_grad = True +class TestCase6(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 10, 12, 16) + self.axis = (3, 1, 2, 0) + + +class TestCase7(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 10, 2, 16) + self.axis = (0, 1, 3, 2) + + +class TestCase8(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (0, 1, 3, 2, 4, 5, 6, 7) + + +class TestCase9(TestTransposeOp): + def init_shape_axis(self): + self.shape = (2, 3, 2, 3, 2, 4, 3, 3) + self.axis = (6, 1, 3, 5, 0, 2, 4, 7) + + +class TestTransposeOpFP16(TestTransposeOp): def init_dtype(self): self.dtype = np.float16 + def test_check_grad(self): + pass + + +class TestTransposeOpInt64(TestTransposeOp): + def init_dtype(self): + self.dtype = 
np.int64 + + def test_check_grad(self): + pass + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index a50a667f663ee..41fd0b442fe1c 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -134,6 +134,10 @@ def product(dim): delta = np.array(delta).astype(np.float16) elif tensor_to_check_dtype == core.VarDesc.VarType.BF16: tensor_to_check_dtype = np.float32 + elif tensor_to_check_dtype == core.VarDesc.VarType.COMPLEX64: + tensor_to_check_dtype = np.complex64 + elif tensor_to_check_dtype == core.VarDesc.VarType.COMPLEX128: + tensor_tp_check_dtype = np.complex128 else: raise ValueError("Not supported data type " + str( tensor_to_check_dtype)) @@ -143,6 +147,9 @@ def get_output(): op.run(scope, place) for output_name in output_names: output_numpy = np.array(scope.find_var(output_name).get_tensor()) + # numpy.dtype does not have bfloat16, thus we use numpy.uint16 to + # store bfloat16 data, and need to be converted to float to check + # the floating precision. if tensor_to_check._dtype() == core.VarDesc.VarType.BF16: output_numpy = convert_uint16_to_float(output_numpy) sum.append(output_numpy.astype(tensor_to_check_dtype).mean()) @@ -358,11 +365,26 @@ def try_call_once(self, data_type): self.dtype = data_type def is_bfloat16_op(self): + # self.dtype is the dtype of inputs, and is set in infer_dtype_from_inputs_outputs. + # Make sure this function is called after calling infer_dtype_from_inputs_outputs. return self.dtype == np.uint16 or ( - hasattr(self, 'mkldnn_data_type') and - getattr(self, 'mkldnn_data_type') is "bfloat16") or ( - hasattr(self, 'attrs') and 'mkldnn_data_type' in self.attrs and - self.attrs['mkldnn_data_type'] == 'bfloat16') + hasattr(self, 'output_dtype') and + self.output_dtype == np.uint16) or ( + hasattr(self, 'mkldnn_data_type') and + getattr(self, 'mkldnn_data_type') is "bfloat16") or ( + hasattr(self, 'attrs') and + 'mkldnn_data_type' in self.attrs and + self.attrs['mkldnn_data_type'] == 'bfloat16') + + def is_mkldnn_op(self): + return (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or ( + hasattr(self, "attrs") and "use_mkldnn" in self.attrs and + self.attrs["use_mkldnn"] == True) + + def is_xpu_op(self): + return (hasattr(self, "use_xpu") and self.use_xpu == True) or ( + hasattr(self, "attrs") and "use_xpu" in self.attrs and + self.attrs["use_xpu"] == True) def infer_dtype_from_inputs_outputs(self, inputs, outputs): def is_np_data(input): @@ -394,8 +416,8 @@ def infer_dtype(numpy_dict, dtype_set): # infer dtype from inputs, and dtype means the precision of the test # collect dtype of all inputs - dtype_set = set() - infer_dtype(inputs, dtype_set) + input_dtype_set = set() + infer_dtype(inputs, input_dtype_set) dtype_list = [ np.dtype(np.float64), np.dtype(np.float32), np.dtype(np.float16), np.dtype(np.int64), np.dtype(np.int32), np.dtype(np.uint16), @@ -404,12 +426,20 @@ def infer_dtype(numpy_dict, dtype_set): ] # check the dtype in dtype_list in order, select the first dtype that in dtype_set for dtype in dtype_list: - if dtype in dtype_set: + if dtype in input_dtype_set: self.dtype = dtype break - # save dtype in class attr + # save input dtype in class attr self.__class__.dtype = self.dtype + # infer dtype of outputs + output_dtype_set = set() + infer_dtype(outputs, output_dtype_set) + for dtype in dtype_list: + if dtype in output_dtype_set: + self.output_dtype = dtype + break + def 
feed_var(self, input_vars, place): feed_map = {} for var_name in input_vars: @@ -435,14 +465,10 @@ def feed_var(self, input_vars, place): def _append_ops(self, block): self.__class__.op_type = self.op_type # for ci check, please not delete it for now - if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \ - (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \ - self.attrs["use_mkldnn"] == True): + if self.is_mkldnn_op(): self.__class__.use_mkldnn = True - if (hasattr(self, "use_xpu") and self.use_xpu == True) or \ - (hasattr(self, "attrs") and "use_xpu" in self.attrs and \ - self.attrs["use_xpu"] == True): + if self.is_xpu_op(): self.__class__.use_xpu = True op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) @@ -1088,12 +1114,15 @@ def check_output_with_place(self, atol = 0 if self.is_bfloat16_op(): - check_dygraph = False - if hasattr(self, 'force_fp32_output') and getattr( - self, 'force_fp32_output'): - atol = 1e-2 + if self.is_mkldnn_op(): + check_dygraph = False + if hasattr(self, 'force_fp32_output') and getattr( + self, 'force_fp32_output'): + atol = 1e-2 + else: + atol = 2 else: - atol = 2 + atol = 1e-2 if no_check_set is not None: if self.op_type not in no_check_set_white_list.no_check_set_white_list: @@ -1189,6 +1218,7 @@ def find_actual(target_name, fetch_list): expect = self.outputs[out_name] expect_t = expect[0] if isinstance(expect, tuple) else expect + # np.uint16 represents bfloat16 if actual_t.dtype == np.uint16 and expect_t.dtype in [ np.float32, np.float64 ]: @@ -1201,6 +1231,7 @@ def find_actual(target_name, fetch_list): expect_t = convert_uint16_to_float(expect_t) actual_t = convert_uint16_to_float(actual_t) atol = max(atol, 0.03) + # NOTE(zhiqiu): np.allclose([], [1.]) returns True # see details: https://stackoverflow.com/questions/38331703/why-does-numpys-broadcasting-sometimes-allow-comparing-arrays-of-different-leng if expect_t.size == 0: @@ -1210,13 +1241,19 @@ def find_actual(target_name, fetch_list): np.allclose( actual_t, expect_t, - rtol=rtol, atol=atol, + rtol=rtol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + "But Got" + str(actual_t) + " in class " + self.__class__.__name__) if check_dygraph: + if self.is_bfloat16_op(): + if imperative_actual_t.dtype == np.uint16: + imperative_actual_t = convert_uint16_to_float( + imperative_actual_t) + if expect_t.dtype == np.uint16: + expect_t = convert_uint16_to_float(expect_t) if six.moves.reduce( lambda x, y: x * y, imperative_actual_t.shape, 1) == 0 and six.moves.reduce( @@ -1228,6 +1265,7 @@ def find_actual(target_name, fetch_list): imperative_actual_t, expect_t, atol=atol, + rtol=rtol, equal_nan=equal_nan), "Output (" + out_name + ") has diff at " + str(place) + "\nExpect " + str(expect_t) + "\n" + @@ -1336,14 +1374,10 @@ def check_output(self, check_dygraph=True, inplace_atol=None): self.__class__.op_type = self.op_type - if (hasattr(self, "use_mkldnn") and self.use_mkldnn == True) or \ - (hasattr(self, "attrs") and "use_mkldnn" in self.attrs and \ - self.attrs["use_mkldnn"] == True): + if self.is_mkldnn_op(): self.__class__.use_mkldnn = True - if (hasattr(self, "use_xpu") and self.use_xpu == True) or \ - (hasattr(self, "attrs") and "use_xpu" in self.attrs and \ - self.attrs["use_xpu"] == True): + if self.is_xpu_op(): self.__class__.use_xpu = True places = self._get_places() @@ -1448,10 +1482,10 @@ def check_grad_with_place(self, op_outputs = self.outputs if hasattr(self, "outputs") else dict() op_attrs = 
self.attrs if hasattr(self, "attrs") else dict() - if self.is_bfloat16_op(): + self._check_grad_helper() + if self.is_bfloat16_op() and self.is_mkldnn_op(): check_dygraph = False - self._check_grad_helper() if self.dtype == np.float64 and \ self.op_type not in op_threshold_white_list.NEED_FIX_FP64_CHECK_GRAD_THRESHOLD_OP_LIST: numeric_grad_delta = 1e-5 diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index 2a5dc76c6bb28..0a60f4cba09bc 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -14,9 +14,153 @@ import unittest import paddle +import random import numpy as np import paddle.fluid as fluid +from op_test import OpTest from functools import partial +from paddle.framework import core + + +def adamw_step(inputs, attributes): + param = inputs['Param'] + grad = inputs['Grad'] + moment1 = inputs['Moment1'] + moment2 = inputs['Moment2'] + lr = inputs['LearningRate'] + beta1_pow = inputs['Beta1Pow'] + beta2_pow = inputs['Beta2Pow'] + + epsilon = attributes['epsilon'] + + if 'lr_ratio' in attributes: + lr = lr * attributes['lr_ratio'] + + if attributes["with_decay"]: + coeff = attributes["coeff"] + decay = 1.0 - lr * coeff + param2 = param * decay + param = param2.copy() + + if 'beta1' in attributes: + beta1 = attributes['beta1'] + else: + beta1 = inputs['Beta1Tensor'][0] + if 'beta2' in attributes: + beta2 = attributes['beta2'] + else: + beta2 = inputs['Beta2Tensor'][0] + + moment1_out = beta1 * moment1 + (1 - beta1) * grad + moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad) + lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) + param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon)) + return param_out, moment1_out, moment2_out + + +class TestAdamW(OpTest): + def setUp(self): + '''Test AdamW Op with supplied attributes + ''' + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (102, 105)).astype("float32") + grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") + moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32") + # The second moment is positive + moment2 = np.random.random((102, 105)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "coeff": 0.5, + "with_decay": True + } + + param_out, moment1_out, \ + moment2_out = adamw_step(self.inputs, self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output() + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestAdamW2(OpTest): + def setUp(self): + '''Test AdamW Op with supplied attributes + ''' + self.op_type = "adamw" + param = np.random.uniform(-1, 1, (2, 2)).astype("float32") + grad = np.random.uniform(-1, 1, (2, 2)).astype("float32") + moment1 = np.random.uniform(-1, 1, (2, 2)).astype("float32") + # The second moment 
is positive + moment2 = np.random.random((2, 2)).astype("float32") + + learning_rate = 0.004 + beta1 = 0.78 + beta2 = 0.836 + epsilon = 1e-4 + beta1_pow = beta1**10 + beta2_pow = beta2**10 + + self.inputs = { + 'Param': param, + 'Grad': grad, + 'Moment1': moment1, + 'Moment2': moment2, + 'LearningRate': np.array([learning_rate]).astype("float32"), + 'Beta1Pow': np.array([beta1_pow]).astype("float32"), + 'Beta2Pow': np.array([beta2_pow]).astype("float32") + } + + self.attrs = { + 'epsilon': epsilon, + 'beta1': beta1, + 'beta2': beta2, + "lr_ratio": 0.1, + "coeff": 0.5, + "with_decay": True + } + + param_out, moment1_out, moment2_out = adamw_step(self.inputs, + self.attrs) + + self.outputs = { + 'Moment1Out': moment1_out, + 'Moment2Out': moment2_out, + 'ParamOut': param_out, + 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) class TestAdamWOp(unittest.TestCase): @@ -160,7 +304,14 @@ def simple_lr_setting(param, decay_rate, n_layers): return decay_rate**(n_layers + 2 - depth) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") class TestAdamWOpLayerwiseLR(TestAdamWOp): + def setUp(self): + random.seed(2021) + np.random.seed(2021) + paddle.seed(2021) + def test_adamw_op_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") @@ -181,17 +332,20 @@ def test_adamw_op_dygraph(self): weight_decay=0.01, lr_ratio=simple_lr_fun) - for _ in range(2): + loss_ref = np.array( + [4.8383293, 3.0854003, 1.33299, -0.418993, -2.171043]) + for i in range(5): a1 = linear1(a) out = linear2(a1) + out = paddle.mean(out) out.backward() adam.step() adam.clear_gradients() + np.testing.assert_allclose(out[0].numpy(), loss_ref[i], rtol=1e-6) def test_adamw_op(self): paddle.enable_static() - place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda() \ - else fluid.CPUPlace() + place = fluid.CUDAPlace(0) train_prog = fluid.Program() startup = fluid.Program() with fluid.program_guard(train_prog, startup): @@ -223,7 +377,10 @@ def test_adamw_op(self): exe = fluid.Executor(place) exe.run(startup) - for _ in range(2): + + loss_ref = np.array( + [0.36120513, 0.2720821, 0.67208904, 0.14607805, 0.24098626]) + for i in range(5): inputs = np.random.random(size=[8, 10]).astype('float32') outputs = np.random.random(size=[8, 1]).astype('float32') rets = exe.run(train_prog, @@ -231,6 +388,7 @@ def test_adamw_op(self): "y": outputs}, fetch_list=[avg_cost]) assert rets[0] is not None + np.testing.assert_allclose(rets[0], loss_ref[i], rtol=1e-6) paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py new file mode 100644 index 0000000000000..89e9b7e817f45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -0,0 +1,287 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0, 1]) +PP_MESH_0 = None +PP_MESH_1 = None + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + if _global_parallel_strategy == "pp": + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1]) + else: + auto.shard_tensor( + self.linear0.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + if _global_parallel_strategy == "pp": + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + elif _global_parallel_strategy == "dp": + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1]) + else: + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + 
complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_backward_dist_attr(dist_context, dist_main_prog, op_need_check): + has_dist_attr = True + vars = dist_main_prog.global_block().vars + + op_dist_attr = dist_context.get_op_distributed_attr_for_program( + op_need_check) + if not op_dist_attr or not op_dist_attr.get_process_mesh(): + has_dist_attr = False + + for var_name in op_need_check.input_arg_names: + if not op_dist_attr.get_input_dims_mapping(var_name) or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh(): + has_dist_attr = False + break + + if has_dist_attr: + for var_name in op_need_check.output_arg_names: + if not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_dims_mapping() or \ + not dist_context.get_tensor_distributed_attr_for_program(vars[var_name]).get_process_mesh(): + has_dist_attr = False + break + + return has_dist_attr + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id == 0: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization(dist_startup_prog, rank_id): + if rank_id == 0: + need_check_params = [ + "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0", + "linear_0.b_0" + ] + else: + need_check_params = ['linear_1.w_0', 'linear_1.b_0'] + + params = [] + for var_name, var in dist_startup_prog.global_block().vars.items(): + if var.is_parameter: + params.append(var_name) + + return params == need_check_params + + +def check_initialization_for_dp(dist_startup_prog): + need_check_params = [ + "layer_norm_0.b_0", "layer_norm_0.w_0", "linear_0.w_0", "linear_0.b_0" + ] + ['linear_1.w_0', 'linear_1.b_0'] + params = [] + for var_name, var in dist_startup_prog.global_block().vars.items(): + if var.is_parameter: + params.append(var_name) + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + + return params == need_check_params == broadcast_varnames + + +class TestMLPReshard(unittest.TestCase): + def test_complete_backward_annotation(self): + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, 0) + complete_backward_annotation(dist_main_prog, dist_context) + + 
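In this reshard test, dim_mapping carries one entry per tensor axis: -1 marks the axis as replicated, while a non-negative entry names the process-mesh dimension the axis is split across (e.g. dim_mapping=[0, -1] shards the batch axis over the two-rank mesh in the dp branch above). A small helper sketch for the per-rank local shape implied by such a mapping (the function name is hypothetical):

def local_shape(global_shape, dim_mapping, mesh_topology):
    # -1 -> axis replicated on every rank; i -> axis split evenly across
    # mesh_topology[i] processes.
    shape = []
    for size, mapping in zip(global_shape, dim_mapping):
        if mapping == -1:
            shape.append(size)
        else:
            assert size % mesh_topology[mapping] == 0, "axis must divide evenly"
            shape.append(size // mesh_topology[mapping])
    return shape

# The [4, 1024] "input" sharded with dim_mapping=[0, -1] over ProcessMesh([0, 1]):
assert local_shape([4, 1024], [0, -1], mesh_topology=[2]) == [2, 1024]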
op_need_check = None + for op in dist_main_prog.global_block().ops: + if op.type == "gelu_grad": + op_need_check = op + break + + # grad op should have dist attr + self.assertTrue( + check_backward_dist_attr(dist_context, dist_main_prog, + op_need_check)) + + def test_mlp_pp(self): + global _global_parallel_strategy + _global_parallel_strategy = "pp" + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + global PP_MESH_0 + PP_MESH_0 = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH) + global PP_MESH_1 + PP_MESH_1 = auto.ProcessMesh(mesh=[1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 1 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # parameter initialization of every rank should be different in the pipeline scene + self.assertTrue(check_initialization(dist_startup_prog, rank_id)) + + def test_mlp_dp(self): + global _global_parallel_strategy + _global_parallel_strategy = "dp" + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + # send and recv should not exist in dp scene. + self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) + + # all parameters should be initialized in dp scene + self.assertTrue(check_initialization_for_dp(dist_startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py new file mode 100644 index 0000000000000..1e134eebfd23b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -0,0 +1,173 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = "dp_mp_pp" +ROOT_MESH = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) +_global_process_mesh = auto.ProcessMesh( + [[[0, 1], [4, 5]], [[2, 3], [6, 7]]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], parent=ROOT_MESH) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 1]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[0, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) 
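The dp_mp_pp ROOT_MESH above is a 2x2x2 arrangement: slicing along the outermost axis gives the two pipeline stages (PP_MESH_0 = [[0, 1], [4, 5]], which holds linear0, and PP_MESH_1 = [[2, 3], [6, 7]], which holds linear1), while the two inner axes give the data- and model-parallel groups within a stage (which inner axis plays which role depends on the dim_mapping each tensor uses). A NumPy sketch of how those groups fall out of the topology:

import numpy as np

mesh = np.array([[[0, 1], [4, 5]], [[2, 3], [6, 7]]])   # dp_mp_pp ROOT_MESH

stage0, stage1 = mesh[0], mesh[1]                        # the two pipeline stages
assert stage0.flatten().tolist() == [0, 1, 4, 5]         # ranks of PP_MESH_0 (linear0)
assert stage1.flatten().tolist() == [2, 3, 6, 7]         # ranks of PP_MESH_1 (linear1)

# The two inner-axis groupings inside stage 0.
assert [r.tolist() for r in stage0] == [[0, 1], [4, 5]]
assert [c.tolist() for c in stage0.T] == [[0, 4], [1, 5]]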
+ return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id in [0, 1, 4, 5]: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization_for_dpmppp(dist_startup_prog): + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + result = len(broadcast_varnames) > 0 + return result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_dpmppp(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + print(dist_main_prog) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + print(dist_main_prog) + print(dist_startup_prog) + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # check parameter initialization + self.assertTrue(check_initialization_for_dpmppp(dist_startup_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py new file mode 100644 index 0000000000000..5a10a21834570 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -0,0 +1,231 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import DistributedContext +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.completion import complete_backward_annotation +from paddle.distributed.auto_parallel.reshard import reshard + +paddle.enable_static() +_global_parallel_strategy = "mp_pp" +ROOT_MESH = auto.ProcessMesh([[0, 1], [2, 3]]) +_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], parent=ROOT_MESH) +PP_MESH_0 = auto.ProcessMesh([0, 1], parent=ROOT_MESH) +PP_MESH_1 = auto.ProcessMesh([2, 3], parent=ROOT_MESH) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.word_embeddings = nn.Embedding( + hidden_size, + hidden_size, + weight_attr=paddle.ParamAttr( + name="word_embeddings", + initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range))) + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.linear2 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + + def forward(self, input): + auto.shard_tensor( + self.word_embeddings.weight, PP_MESH_0, dim_mapping=[0, -1]) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, dim_mapping=[-1, 0]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, dim_mapping=[0, -1]) + auto.shard_tensor(self.linear2.weight, PP_MESH_1, dim_mapping=[0, -1]) + w_out = self.word_embeddings(input) + out = self.linear0(w_out) + gelu_out = F.gelu(out, approximate=True) + out = self.linear1(gelu_out) + out1 = self.linear2(gelu_out) + out = out + out1 + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data(name="input", shape=[batch_size], dtype='int32') + label = static.data( + name="label", shape=[batch_size, 1], dtype='float32') + + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog(train_program, startup_program, dist_context, rank_id): + global _global_process_mesh + dist_context.set_process_mesh(_global_process_mesh) + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + # auto completion + complete_train_program = auto.complete_annotation(train_program, + dist_context) + + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + # logical partition + auto_parallel_main_prog, auto_parallel_startup_prog = 
partitioner.transpile_forward( + complete_train_program, startup_program) + dist_params_grads = partitioner.apply_backward( + loss, complete_train_program, startup_program, auto_parallel_main_prog, + auto_parallel_startup_prog) + optimizer = paddle.fluid.optimizer.AdamOptimizer() + opt_ops = partitioner.apply_optimize(optimizer, dist_params_grads, + auto_parallel_main_prog, + auto_parallel_startup_prog) + return auto_parallel_main_prog, auto_parallel_startup_prog + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id in [0, 1]: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names[ + 0]: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +def check_initialization_for_mppp(dist_startup_prog, rank_id): + if rank_id in [0, 1]: + need_check_params = [] + else: + need_check_params = ["linear_1.b_0", "linear_2.b_0"] + broadcast_varnames = [] + for op in dist_startup_prog.global_block().ops: + if op.type == "c_broadcast": + broadcast_varnames.append(op.output_arg_names[0]) + + return need_check_params == broadcast_varnames + + +def check_allgather(dist_main_program): + allgather_out = "x@RESHARD_0" + var_result = False + op_result = False + vars = dist_main_program.global_block().vars + if allgather_out in vars and vars[allgather_out].shape == (4, 4): + var_result = True + for op in dist_main_program.global_block().ops: + if op.type == "matmul_v2": + if allgather_out in op.input_arg_names: + op_result = True + return var_result and op_result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_mppp(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = DistributedContext() + rank_id = 2 + dist_main_prog, dist_startup_prog = get_dist_prog( + train_program, startup_program, dist_context, rank_id) + complete_backward_annotation(dist_main_prog, dist_context) + reshard(dist_main_prog, dist_startup_prog, rank_id, dist_context) + + # check send and recv result + self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) + + # parameter which not been sliced should be the same in the mp scene + self.assertTrue( + check_initialization_for_mppp(dist_startup_prog, rank_id)) + + def test_allgather(self): + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + process_mesh = auto.ProcessMesh(mesh=[0, 3], parent=ROOT_MESH) + with static.program_guard(train_program, startup_program): + x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') + x = auto.shard_tensor(x, process_mesh, dim_mapping=[0, -1]) + + w = paddle.static.data(name="w", shape=[4, 4], dtype='float32') + w = auto.shard_tensor(w, process_mesh, dim_mapping=[-1, -1]) + + y = paddle.distributed.shard_op(paddle.matmul, process_mesh, { + x.name: [-1, -1], + w.name: [-1, -1] + }, **{"x": x, + "y": w})[0] + + rank_id = 0 + dist_context = DistributedContext() + dist_strategy = fleet.DistributedStrategy() + partitioner = Partitioner(dist_strategy, dist_context, rank_id) + complete_train_program = auto.complete_annotation(train_program, + dist_context) + 
auto_parallel_main_prog, auto_parallel_startup_prog = partitioner.transpile_forward( + complete_train_program, startup_program) + reshard(auto_parallel_main_prog, startup_program, rank_id, dist_context) + # the x should not be slice + self.assertTrue(check_allgather(auto_parallel_main_prog)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py new file mode 100644 index 0000000000000..bf2ba9f061fd8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -0,0 +1,184 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import os +if os.getenv("CUDA_VISIBLE_DEVICES", None) is None: + os.environ["CUDA_VISIBLE_DEVICES"] = '0' + +import paddle +import paddle.nn as nn +import paddle.static as static +import paddle.nn.functional as F +import paddle.utils as utils +import paddle.distributed.auto_parallel as auto +from paddle.distributed.auto_parallel.context import get_default_distributed_context +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.partitioner import Partitioner +from paddle.distributed.auto_parallel.reshard import reshard +from paddle.distributed.auto_parallel.process import new_process_group + +paddle.enable_static() +_global_parallel_strategy = None +_global_process_mesh = None +ROOT_MESH = auto.ProcessMesh([0]) + + +class MLPLayer(nn.Layer): + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + + def forward(self, input): + if _global_parallel_strategy == "pp": + auto.shard_tensor( + self.linear0.weight, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, PP_MESH_1, dim_mapping=[-1, -1]) + else: + auto.shard_tensor( + self.linear0.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + auto.shard_tensor( + self.linear1.weight, _global_process_mesh, + dim_mapping=[-1, -1]) + + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + + return out + + +def mlp_forward(train_program, start_program): + with static.program_guard(train_program, + start_program), utils.unique_name.guard(): + batch_size = 4 + hidden_size = 1024 + sequence_len = 512 + input = static.data( + name="input", shape=[batch_size, hidden_size], dtype='float32') + label = static.data( + name="label", 
shape=[batch_size, 1], dtype='float32') + + if _global_parallel_strategy == "pp": + auto.shard_tensor(input, PP_MESH_0, dim_mapping=[-1, -1]) + auto.shard_tensor(label, PP_MESH_1, dim_mapping=[-1, -1]) + elif _global_parallel_strategy == "dp": + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[0, -1]) + else: + auto.shard_tensor(input, _global_process_mesh, dim_mapping=[-1, -1]) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + initializer_range=0.02) + + predict = mlp(input) + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + + return loss, train_program, start_program + + +def get_dist_prog_with_parallelizer(train_program, startup_program, + dist_context): + global _global_process_mesh + + dist_strategy = fleet.DistributedStrategy() + dist_strategy.amp = False + dist_strategy.pipeline = False + dist_strategy.recompute = False + + # init parallel optimizer + dist_strategy.semi_auto = True + fleet.init(is_collective=True, strategy=dist_strategy) + + loss, train_program, startup_program = mlp_forward(train_program, + startup_program) + + optimizer = paddle.fluid.optimizer.AdamOptimizer( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + optimizer = fleet.distributed_optimizer(optimizer) + + # fake a comm group + pg = new_process_group([3, 4]) + _, _, distributed_startup_program, distributed_main_program = optimizer.minimize( + loss, startup_program) + + return distributed_main_program, distributed_startup_program + + +def check_send_recv_result(dist_main_prog, rank_id): + send_result = False + recv_result = False + ops = dist_main_prog.global_block().ops + if rank_id == 0: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0@GRAD" in op.output_arg_names[ + 0]: + recv_result = True + else: + for idx, op in enumerate(ops): + if op.type == "send_v2" and "gelu_0.tmp_0@GRAD" in op.input_arg_names: + send_result = True + if op.type == "recv_v2" and "gelu_0.tmp_0" in op.output_arg_names[ + 0]: + recv_result = True + + return send_result and recv_result + + +class TestMLPReshard(unittest.TestCase): + def test_mlp_serial(self): + global _global_parallel_strategy + _global_parallel_strategy = None + global _global_process_mesh + _global_process_mesh = auto.ProcessMesh(mesh=[0], parent=ROOT_MESH) + + train_program = paddle.static.Program() + startup_program = paddle.static.Program() + dist_context = get_default_distributed_context() + rank_id = 0 + dist_main_prog, dist_startup_prog = get_dist_prog_with_parallelizer( + train_program, startup_program, dist_context) + # send and recv should not exist in serial scene. 
+ self.assertFalse(check_send_recv_result(dist_main_prog, rank_id)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index 9eaa69ce64428..cce13a8bf3b74 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -36,6 +36,11 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) else: x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) if data_format == "NCHW": n, c, h, w = x.shape @@ -55,13 +60,19 @@ def _reference_testing(x, scale, offset, mean, var, epsilon, data_format): else: raise ValueError("Unknown data order.") - if len(x_shape) == 2: + if len(x_shape) == 2 or len(x_shape) == 3: y = np.reshape(y, x_shape) return y def _cal_mean_variance(x, epsilon, data_format): assert data_format in ['NCHW', 'NHWC'] + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) x_square = x * x axis = (0, 2, 3) if data_format == 'NCHW' else (0, 1, 2) C = x.shape[1] if data_format == 'NCHW' else x.shape[-1] @@ -76,6 +87,12 @@ def _cal_mean_variance(x, epsilon, data_format): def _reference_training(x, scale, offset, epsilon, data_format): x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": n, c, h, w = x.shape x_square = x * x @@ -94,7 +111,6 @@ def _reference_training(x, scale, offset, epsilon, data_format): offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) y = normalized * scale_tile + offset_tile - return y, mean, var elif data_format == "NHWC": x_square = x * x x_square_sum = np.sum(x_square, (0, 1, 2)) @@ -104,10 +120,13 @@ def _reference_training(x, scale, offset, epsilon, data_format): var = x_square_sum / element_count - mean * mean normalized = (x - mean) / np.sqrt(var + epsilon) y = normalized * scale + offset - return y, mean, var else: raise ValueError("Unknown data order.") + if len(x_shape) == 3: + y = np.reshape(y, x_shape) + return y, mean, var + def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): # Use the following formulas to calculate gradients: @@ -124,6 +143,15 @@ def _reference_grad(x, y_grad, scale, mean, var, epsilon, data_format): if data_format != "NCHW" and data_format != "NHWC": raise ValueError("Unknown data order.") + x_shape = x.shape + if len(x_shape) == 3: + if data_format == "NCHW": # NCL -> NCL1 + x = np.reshape(x, (x_shape[0], x_shape[1], x_shape[2], 1)) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], x_shape[2], 1)) + else: # NLC -> NL1C + x = np.reshape(x, (x_shape[0], x_shape[1], 1, x_shape[2])) + y_grad = np.reshape(y_grad, (x_shape[0], x_shape[1], 1, x_shape[2])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) y_grad = np.transpose(y_grad, (0, 2, 3, 1)) @@ -142,6 +170,9 @@ def _reference_grad(x, y_grad, scale, mean, var, 
epsilon, data_format): x = np.transpose(x, (0, 3, 1, 2)) y_grad = np.transpose(y_grad, (0, 3, 1, 2)) + if len(x_shape) == 3: + x_grad = np.reshape(x_grad, x_shape) + return x_grad, grad_scale, grad_offset diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index 0fc3dccab4a64..948e344e4c158 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -14,7 +14,6 @@ from __future__ import print_function -import op_test import unittest import numpy as np @@ -22,9 +21,10 @@ import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard +from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16 -class TestCastOp1(op_test.OpTest): +class TestCastOpFp32ToFp64(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} @@ -42,7 +42,7 @@ def test_grad(self): self.check_grad(['X'], ['Out']) -class TestCastOp2(op_test.OpTest): +class TestCastOpFp16ToFp32(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float16')} @@ -57,7 +57,7 @@ def test_check_output(self): self.check_output(atol=1e-3) -class TestCastOp3(op_test.OpTest): +class TestCastOpFp32ToFp16(OpTest): def setUp(self): ipt = np.random.random(size=[10, 10]) self.inputs = {'X': ipt.astype('float32')} @@ -72,6 +72,36 @@ def test_check_output(self): self.check_output(atol=1e-3) +class TestCastOpBf16ToFp32(OpTest): + def setUp(self): + ipt = np.array(np.random.randint(10, size=[10, 10])).astype('uint16') + self.inputs = {'X': ipt} + self.outputs = {'Out': convert_uint16_to_float(ipt)} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.BF16), + 'out_dtype': int(core.VarDesc.VarType.FP32) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output() + + +class TestCastOpFp32ToBf16(OpTest): + def setUp(self): + ipt = np.random.random(size=[10, 10]).astype('float32') + self.inputs = {'X': ipt} + self.outputs = {'Out': convert_float_to_uint16(ipt)} + self.attrs = { + 'in_dtype': int(core.VarDesc.VarType.FP32), + 'out_dtype': int(core.VarDesc.VarType.BF16) + } + self.op_type = 'cast' + + def test_check_output(self): + self.check_output() + + class TestCastOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 10cd774ce04be..5f936e577a06f 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -16,7 +16,7 @@ import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard, core import paddle diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index db05801c7227b..8ea4e369d3236 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -20,7 +20,8 @@ import paddle import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest, convert_float_to_uint16, get_numeric_gradient +from paddle.fluid.tests.unittests.op_test import ( + OpTest, 
convert_float_to_uint16, get_numeric_gradient) from paddle.fluid.tests.unittests.testsuite import create_op from paddle.fluid import Program, program_guard diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py index 027c806fc02e9..89125dc326d15 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py @@ -22,7 +22,7 @@ paddle.enable_static() import paddle.fluid.core as core import paddle.fluid as fluid -from op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest def conv2dtranspose_forward_naive(input_, filter_, attrs): diff --git a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py index d2eae1cce5bcb..d3ed76e34a614 100644 --- a/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py +++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_loss.py @@ -1175,6 +1175,56 @@ def test_cross_entropy_loss_2d_with_weight_none(self): self.assertTrue(np.allclose(static_ret, expected)) self.assertTrue(np.allclose(dy_ret_value, expected)) + def test_cross_entropy_loss_2d_with_weight_axis_change_mean(self): + input_np = np.random.random(size=(2, 3, 2, 2)).astype(self.dtype) #NCHW + label_np = np.random.randint( + 0, 3, size=(2, 2, 2)).astype(np.int64) #NHW + weight_np = np.random.random(size=(3, )).astype(self.dtype) #C + + paddle.enable_static() + prog = fluid.Program() + startup_prog = fluid.Program() + place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( + ) else fluid.CPUPlace() + with fluid.program_guard(prog, startup_prog): + input = fluid.data( + name='input', shape=[2, 3, 2, 2], dtype=self.dtype) + label = fluid.data(name='label', shape=[2, 2, 2], dtype='int64') + weight = fluid.data(name='weight', shape=[3], dtype=self.dtype) + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=weight, reduction='mean', axis=1) + # specify the class channels to axis 1 + ret = cross_entropy_loss(input, label) + + exe = fluid.Executor(place) + static_ret = exe.run(prog, + feed={ + 'input': input_np, + 'label': label_np, + "weight": weight_np + }, + fetch_list=[ret]) + + self.assertIsNotNone(static_ret) + with fluid.dygraph.guard(): + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( + weight=fluid.dygraph.to_variable(weight_np), + reduction='mean', + axis=1) + dy_ret = cross_entropy_loss( + fluid.dygraph.to_variable(input_np), + fluid.dygraph.to_variable(label_np)) + dy_ret_value = dy_ret.numpy() + self.assertIsNotNone(dy_ret_value) + expected = cross_entropy_loss_2d( + np.transpose(input_np, [0, 2, 3, 1]), + label_np, + weight=weight_np, + reduction='mean')[0] + self.assertTrue(np.allclose(static_ret, dy_ret_value)) + self.assertTrue(np.allclose(static_ret, expected)) + self.assertTrue(np.allclose(dy_ret_value, expected)) + def test_cross_entropy_loss_2d_with_weight_mean_ignore_exceedlabel(self): N = 4 C = 3 diff --git a/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py b/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py new file mode 100644 index 0000000000000..88f71f28412e3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_device_name_capability.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest + + +class TestDeviceName(unittest.TestCase): + def test_device_name_default(self): + if paddle.is_compiled_with_cuda(): + name = paddle.device.cuda.get_device_name() + self.assertIsNotNone(name) + + def test_device_name_int(self): + if paddle.is_compiled_with_cuda(): + name = paddle.device.cuda.get_device_name(0) + self.assertIsNotNone(name) + + def test_device_name_CUDAPlace(self): + if paddle.is_compiled_with_cuda(): + name = paddle.device.cuda.get_device_name(paddle.CUDAPlace(0)) + self.assertIsNotNone(name) + + +class TestDeviceCapability(unittest.TestCase): + def test_device_capability_default(self): + if paddle.is_compiled_with_cuda(): + capability = paddle.device.cuda.get_device_capability() + self.assertIsNotNone(capability) + + def test_device_capability_int(self): + if paddle.is_compiled_with_cuda(): + capability = paddle.device.cuda.get_device_capability(0) + self.assertIsNotNone(capability) + + def test_device_capability_CUDAPlace(self): + if paddle.is_compiled_with_cuda(): + capability = paddle.device.cuda.get_device_capability( + paddle.CUDAPlace(0)) + self.assertIsNotNone(capability) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cuda_graph.py b/python/paddle/fluid/tests/unittests/test_cuda_graph.py new file mode 100644 index 0000000000000..7d1317473531e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_cuda_graph.py @@ -0,0 +1,147 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
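The dynamic-graph case in the test below reduces to a short capture/replay cycle: record a small set of ops once, then feed new data by overwriting the captured input tensor and replaying the graph. A condensed sketch of that pattern, assuming a CUDA (non-ROCm) build of Paddle and using illustrative shapes and values:

# Capture/replay sketch of the pattern exercised by test_cuda_graph_dynamic_graph.
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.device.cuda.graphs import CUDAGraph

if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm():
    fluid.set_flags({'FLAGS_allocator_strategy': 'auto_growth'})
    x = paddle.to_tensor(np.zeros([2, 3], dtype="float32"))

    g = CUDAGraph()
    g.capture_begin()   # ops issued from here on are recorded into the graph
    y = x + 10          # y's storage is reused by every subsequent replay
    g.capture_end()

    # Feed new data by writing into the captured input, then replay.
    x.copy_(paddle.to_tensor(np.ones([2, 3], dtype="float32")), False)
    g.replay()
    assert (y.numpy() == 11.0).all()
    g.reset()           # release the captured graph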
+ +import paddle +import paddle.fluid as fluid +from paddle.device.cuda.graphs import CUDAGraph +import unittest +import numpy as np +from paddle.fluid.dygraph.base import switch_to_static_graph +from simple_nets import simple_fc_net_with_inputs + + +class TestCUDAGraph(unittest.TestCase): + def setUp(self): + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm( + ): + fluid.set_flags({ + 'FLAGS_allocator_strategy': 'auto_growth', + 'FLAGS_sync_nccl_allreduce': False, + 'FLAGS_cudnn_deterministic': True + }) + + def random_tensor(self, shape): + return paddle.to_tensor( + np.random.randint( + low=0, high=10, size=shape).astype("float32")) + + @switch_to_static_graph + def test_cuda_graph_static_graph(self): + if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): + return + + seed = 100 + loss_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=True) + loss_no_cuda_graph = self.cuda_graph_static_graph_main( + seed, use_cuda_graph=False) + self.assertEqual(loss_cuda_graph, loss_no_cuda_graph) + + def cuda_graph_static_graph_main(self, seed, use_cuda_graph): + batch_size = 1 + class_num = 10 + image_shape = [batch_size, 784] + label_shape = [batch_size, 1] + + paddle.seed(seed) + np.random.seed(seed) + startup = paddle.static.Program() + main = paddle.static.Program() + with paddle.static.program_guard(main, startup): + image = paddle.static.data( + name="image", shape=image_shape, dtype='float32') + label = paddle.static.data( + name="label", shape=label_shape, dtype='int64') + image.persistable = True + label.persistable = True + loss = simple_fc_net_with_inputs(image, label, class_num) + loss.persistable = True + lr = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2, 3, 4], values=[0.01, 0.02, 0.03, 0.04]) + optimizer = paddle.optimizer.SGD(learning_rate=lr) + optimizer.minimize(loss) + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + scope = paddle.static.Scope() + with paddle.static.scope_guard(scope): + exe.run(startup) + build_strategy = paddle.static.BuildStrategy() + build_strategy.allow_cuda_graph_capture = True + build_strategy.fix_op_run_order = True + build_strategy.fuse_all_optimizer_ops = True + compiled_program = paddle.static.CompiledProgram( + main).with_data_parallel( + loss_name=loss.name, + build_strategy=build_strategy, + places=place) + image_t = scope.var(image.name).get_tensor() + label_t = scope.var(label.name).get_tensor() + loss_t = scope.var(loss.name).get_tensor() + lr_var = main.global_block().var(lr._var_name) + self.assertTrue(lr_var.persistable) + lr_t = scope.var(lr_var.name).get_tensor() + cuda_graph = None + for batch_id in range(20): + image_t.set( + np.random.rand(*image_shape).astype('float32'), place) + label_t.set(np.random.randint( + low=0, high=class_num, size=label_shape, dtype='int64'), + place) + + if batch_id == 1 and use_cuda_graph: + cuda_graph = CUDAGraph(place, mode="global") + cuda_graph.capture_begin() + exe.run(compiled_program) + cuda_graph.capture_end() + + if cuda_graph: + lr_t.set(np.array([lr()], dtype='float32'), place) + cuda_graph.replay() + else: + exe.run(compiled_program) + lr.step() + if cuda_graph: + cuda_graph.reset() + return np.array(loss_t) + + def test_cuda_graph_dynamic_graph(self): + if not paddle.is_compiled_with_cuda() or paddle.is_compiled_with_rocm(): + return + + shape = [2, 3] + x = self.random_tensor(shape) + z = self.random_tensor(shape) + + g = CUDAGraph() + g.capture_begin() + y = x + 10 + z.add_(x) + g.capture_end() + + for _ in 
range(10): + z_np_init = z.numpy() + x_new = self.random_tensor(shape) + x.copy_(x_new, False) + g.replay() + x_np = x_new.numpy() + y_np = y.numpy() + z_np = z.numpy() + self.assertTrue((y_np - x_np == 10).all()) + self.assertTrue((z_np - z_np_init == x_np).all()) + + g.reset() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py index 34abc5b45531a..3b15b06b5efa8 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist_fleetapi.py @@ -32,7 +32,11 @@ def _setup_config(self): def test_dist_train(self): import paddle.fluid as fluid if fluid.core.is_compiled_with_cuda(): - self.check_with_place("dist_mnist.py", delta=1e-5) + self.check_with_place( + "dist_mnist.py", + delta=1e-5, + check_error_log=True, + need_envs={'FLAGS_allreduce_record_one_event': '1'}) class FleetCollectiveTest(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 89755d0365f2c..396d55b3d0a8b 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -232,6 +232,75 @@ def init_test_case(self): self.fix_seed = False +class TestDropoutOpWithSeedOnCPUPlace(unittest.TestCase): + def test_seed_cpu_place(self): + paddle.enable_static() + main_program = Program() + with program_guard(main_program): + seed_input_name = "tensor@SeedInput" + x_var_name = "tensor@X" + x_out_var = "tensor@XOut" + + mask_var_name = "tensor@Mask" + seed_input_var = main_program.global_block().create_var( + name=seed_input_name, + shape=[1], + dtype='int32', + persistable=False, + stop_gradient=True) + x_out_var = main_program.global_block().create_var( + name=x_out_var, + shape=[40, 40], + dtype='float32', + persistable=False, + stop_gradient=True) + x_var = main_program.global_block().create_var( + name=x_var_name, + shape=[40, 40], + dtype='float32', + persistable=False, + stop_gradient=True) + mask_var = main_program.global_block().create_var( + name=mask_var_name, + shape=[1], + dtype='int', + persistable=False, + stop_gradient=True) + + main_program.global_block().append_op( + type="fill_constant", + outputs={"Out": x_var_name}, + attrs={ + "shape": [40, 40], + "dtype": x_var.dtype, + "value": 1.0, + "place_type": 0 + }) + main_program.global_block().append_op( + type='seed', + inputs={}, + outputs={'Out': seed_input_var}, + attrs={'seed': 1, + 'force_cpu': True}) + main_program.global_block().append_op( + type='dropout', + inputs={'X': x_var, + 'Seed': seed_input_var}, + attrs={'dropout_prob': 0.}, + outputs={'Out': x_out_var, + 'Mask': mask_var}) + place = fluid.CPUPlace() + if core.is_compiled_with_cuda(): + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + x_out, mask_out = exe.run( + main_program, + feed={}, + fetch_list=[x_out_var.name, mask_var.name]) + x_in_np = np.ones([40, 40]).astype("float32") + self.assertTrue(np.allclose(x_out, x_in_np)) + + class TestDropoutOpError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): diff --git a/python/paddle/fluid/tests/unittests/test_eig_op.py b/python/paddle/fluid/tests/unittests/test_eig_op.py new file mode 100644 index 0000000000000..bb83de7d0dd67 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eig_op.py @@ -0,0 +1,250 @@ +# Copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from op_test import OpTest, skip_check_grad_ci +import unittest +from paddle.fluid.op import Operator +from paddle.fluid import compiler, Program, program_guard + + +# cast output to complex for numpy.linalg.eig +def cast_to_complex(input, output): + if (input.dtype == np.float32): + output = output.astype(np.complex64) + elif (input.dtype == np.float64): + output = output.astype(np.complex128) + return output + + +# define eig backward function for a single square matrix +def eig_backward(w, v, grad_w, grad_v): + v_tran = np.transpose(v) + v_tran = np.conjugate(v_tran) + w_conj = np.conjugate(w) + w_conj_l = w_conj.reshape(1, w.size) + w_conj_r = w_conj.reshape(w.size, 1) + w_conj_2d = w_conj_l - w_conj_r + + vhgv = np.matmul(v_tran, grad_v) + real_vhgv = np.real(vhgv) + diag_real = real_vhgv.diagonal() + + diag_2d = diag_real.reshape(1, w.size) + rhs = v * diag_2d + mid = np.matmul(v_tran, rhs) + result = vhgv - mid + + res = np.divide(result, w_conj_2d) + row, col = np.diag_indices_from(res) + res[row, col] = 1.0 + + tmp = np.matmul(res, v_tran) + dx = np.linalg.solve(v_tran, tmp) + return dx + + +class TestEigOp(OpTest): + def setUp(self): + paddle.enable_static() + paddle.device.set_device("cpu") + self.op_type = "eig" + self.__class__.op_type = self.op_type + self.init_input() + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(self.x)} + self.outputs = {'Eigenvalues': self.out[0], 'Eigenvectors': self.out[1]} + + def init_input(self): + self.set_dtype() + self.set_dims() + self.x = np.random.random(self.shape).astype(self.dtype) + self.out = np.linalg.eig(self.x) + self.out = (cast_to_complex(self.x, self.out[0]), + cast_to_complex(self.x, self.out[1])) + + # for the real input, a customized checker is needed + def checker(self, outs): + actual_out_w = outs[0].flatten() + expect_out_w = self.out[0].flatten() + actual_out_v = outs[1].flatten() + expect_out_v = self.out[1].flatten() + + length_w = len(expect_out_w) + act_w_real = np.sort( + np.array([np.abs(actual_out_w[i].real) for i in range(length_w)])) + act_w_imag = np.sort( + np.array([np.abs(actual_out_w[i].imag) for i in range(length_w)])) + exp_w_real = np.sort( + np.array([np.abs(expect_out_w[i].real) for i in range(length_w)])) + exp_w_imag = np.sort( + np.array([np.abs(expect_out_w[i].imag) for i in range(length_w)])) + + for i in range(length_w): + self.assertTrue( + np.allclose(act_w_real[i], exp_w_real[i], 1e-6, 1e-5), + "The eigenvalues real part have diff: \nExpected " + + str(act_w_real[i]) + "\n" + "But got: " + str(exp_w_real[i])) + self.assertTrue( + np.allclose(act_w_imag[i], exp_w_imag[i], 1e-6, 1e-5), + "The eigenvalues image part have diff: \nExpected " + + str(act_w_imag[i]) + "\n" + "But got: " + str(exp_w_imag[i])) + + length_v = len(expect_out_v) + act_v_real = np.sort( + np.array([np.abs(actual_out_v[i].real) for i in 
range(length_v)])) + act_v_imag = np.sort( + np.array([np.abs(actual_out_v[i].imag) for i in range(length_v)])) + exp_v_real = np.sort( + np.array([np.abs(expect_out_v[i].real) for i in range(length_v)])) + exp_v_imag = np.sort( + np.array([np.abs(expect_out_v[i].imag) for i in range(length_v)])) + + for i in range(length_v): + self.assertTrue( + np.allclose(act_v_real[i], exp_v_real[i], 1e-6, 1e-5), + "The eigenvectors real part have diff: \nExpected " + + str(act_v_real[i]) + "\n" + "But got: " + str(exp_v_real[i])) + self.assertTrue( + np.allclose(act_v_imag[i], exp_v_imag[i], 1e-6, 1e-5), + "The eigenvectors image part have diff: \nExpected " + + str(act_v_imag[i]) + "\n" + "But got: " + str(exp_v_imag[i])) + + def set_dtype(self): + self.dtype = np.complex64 + + def set_dims(self): + self.shape = (10, 10) + + def init_grad(self): + # grad_w, grad_v complex dtype + gtype = self.dtype + if self.dtype == np.float32: + gtype = np.complex64 + elif self.dtype == np.float64: + gtype = np.complex128 + self.grad_w = np.ones(self.out[0].shape, gtype) + self.grad_v = np.ones(self.out[1].shape, gtype) + self.grad_x = eig_backward(self.out[0], self.out[1], self.grad_w, + self.grad_v) + + def test_check_output(self): + self.check_output_with_place_customized( + checker=self.checker, place=core.CPUPlace()) + + def test_check_grad(self): + self.init_grad() + self.check_grad( + ['X'], ['Eigenvalues', 'Eigenvectors'], + user_defined_grads=[self.grad_x], + user_defined_grad_outputs=[self.grad_w, self.grad_v]) + + +class TestComplex128(TestEigOp): + def set_dtype(self): + self.dtype = np.complex128 + + +@skip_check_grad_ci( + reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" +) +class TestDouble(TestEigOp): + def set_dtype(self): + self.dtype = np.float64 + + def test_check_grad(self): + pass + + +@skip_check_grad_ci( + reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" +) +class TestEigBatchMarices(TestEigOp): + def set_dtype(self): + self.dtype = np.float64 + + def set_dims(self): + self.shape = (3, 10, 10) + + def test_check_grad(self): + pass + + +@skip_check_grad_ci( + reason="For float dtype, numpy.linalg.eig forward outputs real or complex when input is real, therefore the grad computation may be not the same with paddle.linalg.eig" +) +class TestFloat(TestEigOp): + def set_dtype(self): + self.dtype = np.float32 + + def test_check_grad(self): + pass + + +class TestEigStatic(TestEigOp): + def test_check_output_with_place(self): + paddle.enable_static() + place = core.CPUPlace() + input_np = np.random.random([3, 3]).astype('complex') + expect_val, expect_vec = np.linalg.eig(input_np) + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[3, 3], dtype='complex') + act_val, act_vec = paddle.linalg.eig(input) + + exe = fluid.Executor(place) + fetch_val, fetch_vec = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[act_val, act_vec]) + self.assertTrue( + np.allclose(expect_val, fetch_val, 1e-6, 1e-6), + "The eigen values have diff: \nExpected " + str(expect_val) + "\n" + + "But got: " + str(fetch_val)) + self.assertTrue( + np.allclose(np.abs(expect_vec), np.abs(fetch_vec), 1e-6, 1e-6), + "The eigen vectors have diff: \nExpected " + + str(np.abs(expect_vec)) + "\n" + "But got: " + + 
str(np.abs(fetch_vec))) + + +class TestEigWrongDimsError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("cpu") + paddle.disable_static() + a = np.random.random((3)).astype('float32') + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +class TestEigNotSquareError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("cpu") + paddle.disable_static() + a = np.random.random((1, 2, 3)).astype('float32') + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +class TestEigUnsupportedDtypeError(unittest.TestCase): + def test_error(self): + paddle.device.set_device("cpu") + paddle.disable_static() + a = (np.random.random((3, 3)) * 10).astype('int64') + x = paddle.to_tensor(a) + self.assertRaises(ValueError, paddle.linalg.eig, x) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py index db8689c14c30f..35b74eac4b075 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_hybrid_meta_optimizer.py @@ -72,8 +72,7 @@ def test_opt_sharding_with_pp(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -155,8 +154,7 @@ def test_opt_sharding_with_pp_with_allreduce_fuse(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -218,7 +216,7 @@ def test_opt_sharding_with_pp_amp_gclip(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -292,7 +290,7 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -321,6 +319,82 @@ def test_opt_sharding_with_pp_amp_gclip_fuse_gm(self): 'c_broadcast' ]) + def test_opt_sharding_with_pp_amp_ckp_fuse_gm_optcast(self): + train_prog, startup_prog = static.Program(), static.Program() + avg_cost, strategy = self.pp_net(train_prog, startup_prog) + + self.set_strategy(strategy, 'pipeline') + self.set_strategy(strategy, 'amp') + strategy.amp_configs = {'custom_black_varnames': ['fc_6.b_0'], } + strategy.recompute = True + strategy.recompute_configs = { + "checkpoints": + ["fc_0.tmp_2", "fc_1.tmp_2", "fc_2.tmp_2", "fc_3.tmp_2"] + } + + strategy.sharding = True + 
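+ # Hybrid setup under test: 2-way dp x 2-way pp; the dp group is reused for
+ # optimizer-state sharding (_dp_as_optimizer_sharding) and the fp16 parameter
+ # casts are emitted together with the optimizer ops (optimize_cast).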
strategy.sharding_configs = { + "sharding_degree": 1, + "pp_degree": 2, + "dp_degree": 2, + "_dp_as_optimizer_sharding": True, + 'optimize_cast': True, + } + strategy.fuse_all_reduce_ops = True + strategy.fuse_grad_size_in_MB = 32 + strategy.fuse_grad_merge = True + + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + train_prog = train_prog._pipeline_opt['section_program'] + startup_prog = startup_prog._pipeline_opt['startup_program'] + + # self._debug = True + self.debug_program(train_prog, startup_prog) + + startup_prog_ops = startup_prog.global_block().ops + main_prog_ops = train_prog.global_block().ops + + # check program + startup_prog_op_types = [op.type for op in startup_prog_ops] + main_prog_op_types = [op.type for op in main_prog_ops] + + # global, sharding, pp_send, pp_recv + self.assertEqual(startup_prog_op_types, [ + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', + 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', + 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', + 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', + 'cast', 'c_broadcast' + ]) + + self.assertEqual(main_prog_op_types, [ + 'recv_v2', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', + 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'cast', + 'elementwise_add', 'cast', 'softmax', 'cast', 'cross_entropy2', + 'mean', 'elementwise_mul', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'coalesce_tensor', 'coalesce_tensor', + 'coalesce_tensor', 'fill_constant', 'elementwise_mul_grad', + 'mean_grad', 'cross_entropy_grad2', 'cast', 'softmax_grad', 'cast', + 'elementwise_add_grad', 'cast', 'mul_grad', 'cast', 'tanh_grad', + 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'tanh_grad', + 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'cast', 'c_sync_calc_stream', + 'send_v2', 'cast', 'sum', 'sum', 'cast', 'sum', 'c_reduce_sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', + 'check_finite_and_unscale', 'cast', 'c_allreduce_max', + 'c_allreduce_max', 'cast', 'update_loss_scaling', 'momentum', + 'cast', 'momentum', 'cast', 'momentum', 'cast', 'momentum', + 'momentum', 'cast', 'coalesce_tensor', 'c_broadcast', 'c_broadcast', + 'coalesce_tensor', 'c_broadcast' + ]) + class TestFleetHybridOptimizerBoundary(TestFleetMetaOptimizer): def setUp(self): @@ -384,7 +458,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', - 'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' + 'c_comm_init', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -435,7 +509,7 @@ def test_opt_sharding_with_pp_amp_gclip_boundary_card1(self): 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 
'c_comm_init', 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 61d98d32ec5fd..7cb033b748874 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -655,7 +655,9 @@ def test_hybrid_with_mp_pp_amp_gclip(self): 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -764,7 +766,7 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -922,18 +924,17 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast(self): # ring: mp, pp_group, pp_pair, pp_pair self.assertEqual(startup_prog_op_types, [ - 'uniform_random', 'cast', 'fill_constant', 'cast', 'uniform_random', - 'cast', 'fill_constant', 'cast', 'uniform_random', 'cast', - 'fill_constant', 'cast', 'uniform_random', 'cast', 'fill_constant', - 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1019,20 +1020,18 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_offload(self): # ring: mp, pp_group, pp_pair, pp_pair self.assertEqual(startup_prog_op_types, [ - 'uniform_random', 'cast', 'memcpy', 'fill_constant', 'cast', - 'memcpy', 'uniform_random', 'cast', 'memcpy', 'fill_constant', - 'cast', 'memcpy', 'uniform_random', 'cast', 'memcpy', - 'fill_constant', 'cast', 'memcpy', 'uniform_random', 'cast', - 'memcpy', 'fill_constant', 'fill_constant', 'fill_constant', + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 
'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', 'memcpy', + 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', + 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', + 'c_broadcast', 'cast', 'memcpy', 'c_broadcast', 'cast', 'memcpy', + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1122,18 +1121,17 @@ def test_hybrid_with_pp_dp_amp_fp16allreduce_optimize_cast_with_gradient_fuse( # ring: mp, pp_group, pp_pair, pp_pair self.assertEqual(startup_prog_op_types, [ - 'uniform_random', 'cast', 'fill_constant', 'cast', 'uniform_random', - 'cast', 'fill_constant', 'cast', 'uniform_random', 'cast', - 'fill_constant', 'cast', 'uniform_random', 'cast', 'fill_constant', - 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'uniform_random', 'fill_constant', 'uniform_random', + 'fill_constant', 'uniform_random', 'fill_constant', + 'uniform_random', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast', 'cast', 'c_broadcast', 'cast', 'c_broadcast', 'cast', + 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ @@ -1225,7 +1223,7 @@ def test_hybrid_with_pp_dp_amp_with_gradient_fuse(self): 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_gen_nccl_id', 'c_comm_init', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', - 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream' + 'c_broadcast', 'c_broadcast' ]) self.assertEqual(main_prog_op_types, [ diff --git a/python/paddle/fluid/tests/unittests/test_get_device_properties.py b/python/paddle/fluid/tests/unittests/test_get_device_properties.py new file mode 100644 index 0000000000000..4cfb91bfae93e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_get_device_properties.py @@ -0,0 +1,70 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +from paddle.fluid import core +from paddle.device.cuda import device_count, get_device_properties + + +class TestGetDeviceProperties(unittest.TestCase): + def test_get_device_properties_default(self): + if core.is_compiled_with_cuda(): + props = get_device_properties() + self.assertIsNotNone(props) + + def test_get_device_properties_str(self): + if core.is_compiled_with_cuda(): + props = get_device_properties('gpu:0') + self.assertIsNotNone(props) + + def test_get_device_properties_int(self): + if core.is_compiled_with_cuda(): + gpu_num = device_count() + for i in range(gpu_num): + props = get_device_properties(i) + self.assertIsNotNone(props) + + def test_get_device_properties_CUDAPlace(self): + if core.is_compiled_with_cuda(): + device = core.CUDAPlace(0) + props = get_device_properties(device) + self.assertIsNotNone(props) + + +class TestGetDevicePropertiesError(unittest.TestCase): + def test_error_api(self): + if core.is_compiled_with_cuda(): + + def test_device_indexError_error(): + device_error = device_count() + 1 + props = get_device_properties(device_error) + + self.assertRaises(IndexError, test_device_indexError_error) + + def test_device_value_error1(): + device_error = 'gpu1' + props = get_device_properties(device_error) + + self.assertRaises(ValueError, test_device_value_error1) + + def test_device_value_error2(): + device_error = float(device_count()) + props = get_device_properties(device_error) + + self.assertRaises(ValueError, test_device_value_error2) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_hdfs2.py b/python/paddle/fluid/tests/unittests/test_hdfs2.py index 1fa019bb9cd02..a74fc558382fe 100644 --- a/python/paddle/fluid/tests/unittests/test_hdfs2.py +++ b/python/paddle/fluid/tests/unittests/test_hdfs2.py @@ -35,6 +35,7 @@ def test_hdfs(self): self._test_rm(fs) self._test_touch(fs) self._test_dirs(fs) + self._test_list_files_info(fs) def test_local(self): fs = LocalFS() diff --git a/python/paddle/fluid/tests/unittests/test_linalg_cond.py b/python/paddle/fluid/tests/unittests/test_linalg_cond.py index 2b42eca38e6fc..237c96430249b 100644 --- a/python/paddle/fluid/tests/unittests/test_linalg_cond.py +++ b/python/paddle/fluid/tests/unittests/test_linalg_cond.py @@ -28,7 +28,7 @@ def test_static_assert_true(self, x_list, p_list): for x in x_list: with static.program_guard(static.Program(), static.Program()): input_data = static.data("X", shape=x.shape, dtype=x.dtype) - output = paddle.cond(input_data, p) + output = paddle.linalg.cond(input_data, p) exe = static.Executor() result = exe.run(feed={"X": x}, fetch_list=[output]) expected_output = np.linalg.cond(x, p) @@ -39,7 +39,7 @@ def test_dygraph_assert_true(self, x_list, p_list): for p in p_list: for x in x_list: input_tensor = paddle.to_tensor(x) - output = paddle.cond(input_tensor, p) + output = paddle.linalg.cond(input_tensor, p) expected_output = np.linalg.cond(x, p) self.assertTrue(np.allclose(output, expected_output)) @@ -103,12 +103,12 @@ def test_dygraph_api_error(self): for p in p_list_error: for x in (x_list_n_n + x_list_m_n): x_tensor = paddle.to_tensor(x) - self.assertRaises(ValueError, paddle.cond, x_tensor, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_tensor, p) for p in p_list_n_n: for x in x_list_m_n: x_tensor = paddle.to_tensor(x) - self.assertRaises(ValueError, paddle.cond, x_tensor, p) + 
self.assertRaises(ValueError, paddle.linalg.cond, x_tensor, p) def test_static_api_error(self): paddle.enable_static() @@ -119,13 +119,13 @@ def test_static_api_error(self): for x in (x_list_n_n + x_list_m_n): with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) for p in p_list_n_n: for x in x_list_m_n: with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) # it's not supported when input is an empty tensor in static mode def test_static_empty_input_error(self): @@ -136,13 +136,13 @@ def test_static_empty_input_error(self): for x in x_list_n_n: with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) for p in (p_list_n_n + p_list_m_n): for x in x_list_n_n: with static.program_guard(static.Program(), static.Program()): x_data = static.data("X", shape=x.shape, dtype=x.dtype) - self.assertRaises(ValueError, paddle.cond, x_data, p) + self.assertRaises(ValueError, paddle.linalg.cond, x_data, p) class TestCondEmptyTensorInput(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py index 97047b1ae0e5e..8856624b4efc7 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_dot_op.py +++ b/python/paddle/fluid/tests/unittests/test_multi_dot_op.py @@ -198,32 +198,34 @@ def test_errors(self): paddle.static.Program()): # The inputs type of multi_dot must be list matrix. input1 = 12 - self.assertRaises(TypeError, paddle.multi_dot, [input1, input1]) + self.assertRaises(TypeError, paddle.linalg.multi_dot, + [input1, input1]) # The inputs dtype of multi_dot must be float64, float64 or float16. 
input2 = paddle.static.data( name='input2', shape=[10, 10], dtype="int32") - self.assertRaises(TypeError, paddle.multi_dot, [input2, input2]) + self.assertRaises(TypeError, paddle.linalg.multi_dot, + [input2, input2]) # the number of tensor must be larger than 1 x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x0]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x0]) #the first tensor must be 1D or 2D x1 = paddle.static.data(name='x1', shape=[3, 2, 3], dtype="float64") x2 = paddle.static.data(name='x2', shape=[3, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x1, x2]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x1, x2]) #the last tensor must be 1D or 2D x3 = paddle.static.data(name='x3', shape=[3, 2], dtype="float64") x4 = paddle.static.data(name='x4', shape=[3, 2, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x3, x4]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x3, x4]) #the tensor must be 2D, except first and last tensor x5 = paddle.static.data(name='x5', shape=[3, 2], dtype="float64") x6 = paddle.static.data(name='x6', shape=[2], dtype="float64") x7 = paddle.static.data(name='x7', shape=[2, 2], dtype="float64") - self.assertRaises(ValueError, paddle.multi_dot, [x5, x6, x7]) + self.assertRaises(ValueError, paddle.linalg.multi_dot, [x5, x6, x7]) class APITestMultiDot(unittest.TestCase): @@ -232,7 +234,7 @@ def test_out(self): with paddle.static.program_guard(paddle.static.Program()): x0 = paddle.static.data(name='x0', shape=[3, 2], dtype="float64") x1 = paddle.static.data(name='x1', shape=[2, 3], dtype='float64') - result = paddle.multi_dot([x0, x1]) + result = paddle.linalg.multi_dot([x0, x1]) exe = paddle.static.Executor(paddle.CPUPlace()) data1 = np.random.rand(3, 2).astype("float64") data2 = np.random.rand(2, 3).astype("float64") @@ -254,7 +256,7 @@ def test_dygraph_without_out(self): input_array2 = np.random.rand(4, 3).astype("float64") data1 = paddle.to_tensor(input_array1) data2 = paddle.to_tensor(input_array2) - out = paddle.multi_dot([data1, data2]) + out = paddle.linalg.multi_dot([data1, data2]) expected_result = np.linalg.multi_dot([input_array1, input_array2]) self.assertTrue(np.allclose(expected_result, out.numpy())) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py index 7a4f7f9fbd62b..f54aa1bb6e556 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_pipeline_parallel.py @@ -42,6 +42,9 @@ def test_hybrid_parallel_save_load(self): def test_hybrid_parallel_recompute(self): self.run_mnist_2gpu('hybrid_parallel_pp_recompute.py') + def test_hybrid_parallel_pp_clip_grad(self): + self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py') + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py new file mode 100644 index 0000000000000..e8b1d838261f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_run_cinn.py @@ -0,0 +1,56 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import paddle +import unittest + +paddle.enable_static() + + +class TestParallelExecutorRunCinn(unittest.TestCase): + def test_run_from_cinn(self): + paddle.set_flags({'FLAGS_use_cinn': True}) + + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + data = paddle.static.data( + name='X', shape=[None, 1], dtype='float32') + prediction = paddle.static.nn.fc(data, 2) + loss = paddle.mean(prediction) + adam = paddle.optimizer.Adam() + adam.minimize(loss) + + place = paddle.CUDAPlace(0) if paddle.is_compiled_with_cuda( + ) else paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_program) + compiled_program = paddle.static.CompiledProgram( + main_program).with_data_parallel(loss_name=loss.name) + + batch_size = 16 + x = np.random.random(size=(batch_size, 1)).astype('float32') + fetch = exe.run(compiled_program, + feed={'X': x}, + fetch_list=[prediction.name], + return_merged=False) + + paddle.set_flags({'FLAGS_use_cinn': False}) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py index 066bcf48612c5..95b8c5c3c0a94 100644 --- a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py +++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py @@ -14,18 +14,89 @@ from __future__ import print_function +import paddle import math import numpy as np import unittest from op_test import OpTest +def calc_psroi_pool(x, rois, rois_num_per_img, output_channels, spatial_scale, + pooled_height, pooled_width): + """ + Psroi_pool implemented by Numpy. + x: 4-D as (N, C, H, W), + rois: 2-D as [[x1, y1, x2, y2], ...], + rois_num_per_img: 1-D as [nums_of_batch_0, nums_of_batch_1, ...] + """ + output_shape = (len(rois), output_channels, pooled_height, pooled_width) + out_data = np.zeros(output_shape) + batch_id = 0 + rois_num_id = 0 + rois_num_left = rois_num_per_img[rois_num_id] + for i in range(len(rois)): + roi = rois[i] + roi_batch_id = batch_id + rois_num_left -= 1 + if rois_num_left == 0: + rois_num_id += 1 + if rois_num_id < len(rois_num_per_img): + rois_num_left = rois_num_per_img[rois_num_id] + batch_id += 1 + roi_start_w = round(roi[0]) * spatial_scale + roi_start_h = round(roi[1]) * spatial_scale + roi_end_w = (round(roi[2]) + 1.) * spatial_scale + roi_end_h = (round(roi[3]) + 1.) 
* spatial_scale + + roi_height = max(roi_end_h - roi_start_h, 0.1) + roi_width = max(roi_end_w - roi_start_w, 0.1) + + bin_size_h = roi_height / float(pooled_height) + bin_size_w = roi_width / float(pooled_width) + + x_i = x[roi_batch_id] + + for c in range(output_channels): + for ph in range(pooled_height): + for pw in range(pooled_width): + hstart = int( + math.floor(float(ph) * bin_size_h + roi_start_h)) + wstart = int( + math.floor(float(pw) * bin_size_w + roi_start_w)) + hend = int( + math.ceil(float(ph + 1) * bin_size_h + roi_start_h)) + wend = int( + math.ceil(float(pw + 1) * bin_size_w + roi_start_w)) + hstart = min(max(hstart, 0), x.shape[2]) + hend = min(max(hend, 0), x.shape[2]) + wstart = min(max(wstart, 0), x.shape[3]) + wend = min(max(wend, 0), x.shape[3]) + + c_in = (c * pooled_height + ph) * pooled_width + pw + is_empty = (hend <= hstart) or (wend <= wstart) + out_sum = 0. + for ih in range(hstart, hend): + for iw in range(wstart, wend): + out_sum += x_i[c_in, ih, iw] + bin_area = (hend - hstart) * (wend - wstart) + out_data[i, c, ph, pw] = 0. if is_empty else ( + out_sum / float(bin_area)) + return out_data + + class TestPSROIPoolOp(OpTest): def set_data(self): + paddle.enable_static() self.init_test_case() self.make_rois() - self.calc_psroi_pool() - self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.outs = calc_psroi_pool(self.x, self.boxes, self.boxes_num, + self.output_channels, self.spatial_scale, + self.pooled_height, + self.pooled_width).astype('float64') + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois_with_batch_id[:, 1:5], self.rois_lod) + } self.attrs = { 'output_channels': self.output_channels, 'spatial_scale': self.spatial_scale, @@ -67,57 +138,10 @@ def make_rois(self): roi = [bno, x1, y1, x2, y2] rois.append(roi) self.rois_num = len(rois) - self.rois = np.array(rois).astype('float64') - - def calc_psroi_pool(self): - output_shape = (self.rois_num, self.output_channels, self.pooled_height, - self.pooled_width) - out_data = np.zeros(output_shape) - for i in range(self.rois_num): - roi = self.rois[i] - roi_batch_id = int(roi[0]) - roi_start_w = round(roi[1]) * self.spatial_scale - roi_start_h = round(roi[2]) * self.spatial_scale - roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale - roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale - - roi_height = max(roi_end_h - roi_start_h, 0.1) - roi_width = max(roi_end_w - roi_start_w, 0.1) - - bin_size_h = roi_height / float(self.pooled_height) - bin_size_w = roi_width / float(self.pooled_width) - - x_i = self.x[roi_batch_id] - - for c in range(self.output_channels): - for ph in range(self.pooled_height): - for pw in range(self.pooled_width): - hstart = int( - math.floor(float(ph) * bin_size_h + roi_start_h)) - wstart = int( - math.floor(float(pw) * bin_size_w + roi_start_w)) - hend = int( - math.ceil( - float(ph + 1) * bin_size_h + roi_start_h)) - wend = int( - math.ceil( - float(pw + 1) * bin_size_w + roi_start_w)) - hstart = min(max(hstart, 0), self.height) - hend = min(max(hend, 0), self.height) - wstart = min(max(wstart, 0), self.width) - wend = min(max(wend, 0), self.width) - - c_in = (c * self.pooled_height + ph - ) * self.pooled_width + pw - is_empty = (hend <= hstart) or (wend <= wstart) - out_sum = 0. - for ih in range(hstart, hend): - for iw in range(wstart, wend): - out_sum += x_i[c_in, ih, iw] - bin_area = (hend - hstart) * (wend - wstart) - out_data[i, c, ph, pw] = 0. 
if is_empty else ( - out_sum / float(bin_area)) - self.outs = out_data.astype('float64') + self.rois_with_batch_id = np.array(rois).astype('float64') + self.boxes = self.rois_with_batch_id[:, 1:] + self.boxes_num = np.array( + [bno + 1 for bno in range(self.batch_size)]).astype('int32') def setUp(self): self.op_type = 'psroi_pool' @@ -130,5 +154,175 @@ def test_check_grad(self): self.check_grad(['X'], 'Out') +class TestPSROIPoolDynamicFunctionAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.random([2, 490, 28, 28]).astype(np.float32) + self.boxes = np.array( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32) + self.boxes_num = np.array([1, 2]).astype(np.int32) + + def test_output_size(self): + def test_output_size_is_int(): + output_size = 7 + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + def test_output_size_is_tuple(): + output_size = (7, 7) + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + def test_dytype_is_float64(): + output_size = (7, 7) + out = paddle.vision.ops.psroi_pool( + paddle.to_tensor(self.x, 'float64'), + paddle.to_tensor(self.boxes, 'float64'), + paddle.to_tensor(self.boxes_num, 'int32'), output_size).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + self.assertTrue(np.allclose(out, expect_out)) + + places = ['cpu'] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append('gpu') + for place in places: + paddle.set_device(place) + test_output_size_is_int() + test_output_size_is_tuple() + test_dytype_is_float64() + + +class TestPSROIPoolDynamicClassAPI(unittest.TestCase): + def setUp(self): + self.x = np.random.random([2, 128, 32, 32]).astype(np.float32) + self.boxes = np.array([[3, 5, 6, 13], [7, 4, 22, 18], [4, 5, 7, 10], + [5, 3, 25, 21]]).astype(np.float32) + self.boxes_num = np.array([2, 2]).astype(np.int32) + + def test_output_size(self): + def test_output_size_is_int(): + psroi_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_module( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num)).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + def test_output_size_is_tuple(): + psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_pool_module( + paddle.to_tensor(self.x), + paddle.to_tensor(self.boxes), + paddle.to_tensor(self.boxes_num)).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + def test_dytype_is_float64(): + psroi_pool_module = paddle.vision.ops.PSRoIPool(8, 1.1) + out = psroi_pool_module( + paddle.to_tensor(self.x, 'float64'), + paddle.to_tensor(self.boxes, 'float64'), + paddle.to_tensor(self.boxes_num, 'int32')).numpy() + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 2, + 1.1, 8, 8) + self.assertTrue(np.allclose(out, expect_out)) + + paddle.disable_static() + places = ['cpu'] + if paddle.fluid.core.is_compiled_with_cuda(): + 
places.append('gpu') + for place in places: + paddle.set_device(place) + test_output_size_is_int() + test_output_size_is_tuple() + test_dytype_is_float64() + + +class TestPSROIPoolBoxesNumError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 490, 28, 28], dtype='float32') + self.boxes = paddle.to_tensor( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], 'float32') + + def test_errors(self): + def test_boxes_num_nums_error(): + boxes_num = paddle.to_tensor([1, 5], 'int32') + out = paddle.vision.ops.psroi_pool( + self.x, self.boxes, boxes_num, output_size=7) + + self.assertRaises(ValueError, test_boxes_num_nums_error) + + def test_boxes_num_length_error(): + boxes_num = paddle.to_tensor([1, 1, 1], 'int32') + out = paddle.vision.ops.psroi_pool( + self.x, self.boxes, boxes_num, output_size=7) + + self.assertRaises(ValueError, test_boxes_num_length_error) + + +class TestPSROIPoolChannelError(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.x = paddle.uniform([2, 490, 28, 28], dtype='float32') + self.boxes = paddle.to_tensor( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], 'float32') + self.output_size = 4 + + def test_errors(self): + def test_channel_error(): + boxes_num = paddle.to_tensor([2, 1], 'int32') + out = paddle.vision.ops.psroi_pool(self.x, self.boxes, boxes_num, + self.output_size) + + self.assertRaises(ValueError, test_channel_error) + + +class TestPSROIPoolStaticAPI(unittest.TestCase): + def setUp(self): + paddle.enable_static() + self.x_placeholder = paddle.static.data( + name='x', shape=[2, 490, 28, 28]) + self.x = np.random.random([2, 490, 28, 28]).astype(np.float32) + self.boxes_placeholder = paddle.static.data( + name='boxes', shape=[3, 4], lod_level=1) + self.boxes = np.array( + [[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]]).astype(np.float32) + self.boxes_num = np.array([1, 2]).astype(np.int32) + + def test_function_in_static(self): + output_size = 7 + out = paddle.vision.ops.psroi_pool(self.x_placeholder, + self.boxes_placeholder, + self.boxes_num, output_size) + expect_out = calc_psroi_pool(self.x, self.boxes, self.boxes_num, 10, + 1.0, 7, 7) + places = [paddle.CPUPlace()] + if paddle.fluid.core.is_compiled_with_cuda(): + places.append(paddle.CUDAPlace(0)) + for place in places: + exe = paddle.static.Executor(place) + boxes_lod_data = paddle.fluid.create_lod_tensor(self.boxes, + [[1, 2]], place) + out_res = exe.run(paddle.static.default_main_program(), + feed={'x': self.x, + 'boxes': boxes_lod_data}, + fetch_list=[out.name]) + self.assertTrue(np.allclose(out_res, expect_out)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_rnn_dp.py b/python/paddle/fluid/tests/unittests/test_rnn_dp.py new file mode 100644 index 0000000000000..8d7e86fcdb9c7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_rnn_dp.py @@ -0,0 +1,157 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
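A minimal sketch (not part of the patch): the PSROIPool tests above replace the old batch-id column with a flat boxes array plus a per-image boxes_num, which is exactly what the NumPy reference calc_psroi_pool walks via its rois_num_per_img argument. Standalone NumPy, reusing the values from the dynamic-API test:

import numpy as np

boxes = np.array([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32')
boxes_num = np.array([1, 2], dtype='int32')  # image 0 owns one box, image 1 owns two

# Same box-to-image mapping that the rois_num_left loop in calc_psroi_pool produces.
batch_ids = np.repeat(np.arange(len(boxes_num)), boxes_num)
print(batch_ids)  # [0 1 1]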
+ +import unittest +import paddle +import os + +import numpy as np +import paddle +import paddle.static as static +import paddle.distributed.fleet as fleet +import paddle.nn as nn +import paddle.nn.functional as F + +paddle.enable_static() + + +class RNNEncoder(nn.Layer): + def __init__(self, + input_size, + hidden_size, + num_layers=1, + direction="forward", + dropout=0.0, + pooling_type=None, + **kwargs): + super().__init__() + self._input_size = input_size + self._hidden_size = hidden_size + self._direction = direction + self._pooling_type = pooling_type + + self.rnn_layer = nn.SimpleRNN( + input_size=input_size, + hidden_size=hidden_size, + num_layers=num_layers, + direction=direction, + dropout=dropout, + **kwargs) + + def get_input_dim(self): + return self._input_size + + def get_output_dim(self): + if self._direction == "bidirect": + return self._hidden_size * 2 + else: + return self._hidden_size + + def forward(self, inputs, sequence_length): + encoded_text, last_hidden = self.rnn_layer( + inputs, sequence_length=sequence_length) + output = paddle.max(encoded_text, axis=1) + return output + + +class RNNModel(nn.Layer): + def __init__(self, + vocab_size, + num_classes, + emb_dim=128, + padding_idx=0, + rnn_hidden_size=198, + direction='forward', + rnn_layers=1, + dropout_rate=0.0, + pooling_type=None, + fc_hidden_size=96): + super().__init__() + self.embedder = nn.Embedding( + num_embeddings=vocab_size, + embedding_dim=emb_dim, + padding_idx=padding_idx) + self.rnn_encoder = RNNEncoder( + emb_dim, + rnn_hidden_size, + num_layers=rnn_layers, + direction=direction, + dropout=dropout_rate, + pooling_type=pooling_type) + self.fc = nn.Linear(self.rnn_encoder.get_output_dim(), fc_hidden_size) + self.output_layer = nn.Linear(fc_hidden_size, num_classes) + + def forward(self, text, seq_len): + embedded_text = self.embedder(text) + text_repr = self.rnn_encoder(embedded_text, sequence_length=seq_len) + fc_out = paddle.tanh(self.fc(text_repr)) + logits = self.output_layer(fc_out) + return logits + + +def rnn_pretrain_forward(train_program, start_program, topo=None): + with static.program_guard(train_program, + start_program), paddle.utils.unique_name.guard(): + batch_size = 1 + tokens = static.data( + name="tokens", shape=[batch_size, -1], dtype="int64") + seq_len = static.data(name="ids", shape=[batch_size], dtype="int64") + labels = static.data(name="labels", shape=[batch_size], dtype="int64") + data_holders = [tokens, seq_len, labels] + vocab_size = 10 + num_classes = 2 + pad_token_id = 0 + model = RNNModel( + vocab_size, + num_classes, + direction='forward', + padding_idx=pad_token_id, + pooling_type='max') + + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), learning_rate=0.001) + criterion = paddle.nn.CrossEntropyLoss() + preds = model(tokens, seq_len) + loss = criterion(preds, labels) + + return train_program, start_program, loss, optimizer, data_holders + + +class TestFleetMetaOptimizer(unittest.TestCase): + def setUp(self): + os.environ["PADDLE_TRAINER_ID"] = "1" + os.environ[ + "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" + + def test_rnn_raw_optimizer(self): + import paddle.distributed.fleet as fleet + import paddle.distributed.fleet.base.role_maker as role_maker + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + train_program = static.Program() + start_program = static.Program() + train_program, start_program, loss, optimizer, data_holders = \ + rnn_pretrain_forward(train_program, start_program) + with 
paddle.static.program_guard( + train_program, start_program), paddle.utils.unique_name.guard(): + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True + strategy.fuse_all_reduce_ops = True + fleet.init(is_collective=True, strategy=strategy) + optimizer = fleet.distributed_optimizer(optimizer) + optimizer.minimize(loss) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_run_program_op.py b/python/paddle/fluid/tests/unittests/test_run_program_op.py index b3d0845a4fbbc..33b32a6632c9e 100644 --- a/python/paddle/fluid/tests/unittests/test_run_program_op.py +++ b/python/paddle/fluid/tests/unittests/test_run_program_op.py @@ -343,5 +343,53 @@ def build_model(self): return fwd_op_num +class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + self.fc1 = paddle.nn.Linear(10, 10) + self.fc2 = paddle.nn.Linear(10, 1) + + def forward(self, x): + out = self.fc1(x) + out.stop_gradient = True + out = self.fc2(out) + return out + + +class TestParametersWithStopGradient(unittest.TestCase): + def setUp(self): + self.seed = 2021 + self.iter = 5 + + def train(self, to_static): + # prepare env + paddle.seed(self.seed) + + net = Net() + if to_static: + net = paddle.jit.to_static(net) + sgd = paddle.optimizer.SGD(0.01, parameters=net.parameters()) + + for i in range(self.iter): + x = paddle.rand([4, 10]) + out = net(x) + loss = paddle.mean(out) + + loss.backward() + sgd.minimize(loss) + net.clear_gradients() + + return loss + + def test_stop_gradient(self): + paddle.disable_static() + + dy_loss = self.train(to_static=False) + st_loss = self.train(to_static=True) + self.assertEqual(dy_loss[0], st_loss[0]) + + paddle.enable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_seed_op.py b/python/paddle/fluid/tests/unittests/test_seed_op.py index 7d6705f72569b..08478d7140d43 100644 --- a/python/paddle/fluid/tests/unittests/test_seed_op.py +++ b/python/paddle/fluid/tests/unittests/test_seed_op.py @@ -25,7 +25,7 @@ def setUp(self): self.op_type = "seed" self.inputs = {} self.attrs = {"seed": 123} - self.outputs = {"Out": np.asarray((123)).astype('int32')} + self.outputs = {"Out": np.asarray((123)).astype('int')} def test_check_output(self): self.check_output() @@ -36,7 +36,7 @@ def setUp(self): self.op_type = "seed" self.inputs = {} self.attrs = {"seed": 0} - self.outputs = {"Out": np.asarray((123)).astype('int32')} + self.outputs = {"Out": np.asarray((123)).astype('int')} def test_check_output(self): self.check_output(no_check_set=["Out"]) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py new file mode 100644 index 0000000000000..5134b885f3307 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -0,0 +1,318 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
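The sparse_attention tests that follow build a block-diagonal mask and encode it in CSR form through init_csr_format. As a hedged sketch in plain NumPy, for a hypothetical 4x4 mask with blocksize 2 (the resulting values match the offset/columns used in the sparse_attention docstring example later in this patch):

import numpy as np

mask = np.array([[1, 1, 0, 0],
                 [1, 1, 0, 0],
                 [0, 0, 1, 1],
                 [0, 0, 1, 1]])

# offset[i + 1] is the running count of non-zeros through row i;
# columns lists the column index of every non-zero, row by row.
offset = np.concatenate([[0], np.cumsum(mask.sum(axis=1))]).astype('int32')
columns = np.concatenate([np.nonzero(row)[0] for row in mask]).astype('int32')
print(offset)   # [0 2 4 6 8]
print(columns)  # [0 1 0 1 2 3 2 3]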
+ +import unittest +import numpy as np +from op_test import OpTest +import paddle.fluid.core as core +from paddle.static import Program, program_guard +import paddle +import paddle.fluid as fluid +import paddle.fluid.framework as framework +import paddle.nn.functional as F +import os +import re + + +def get_cuda_version(): + result = os.popen("nvcc --version").read() + regex = r'release (\S+),' + match = re.search(regex, result) + if match: + num = str(match.group(1)) + integer, decimal = num.split('.') + return int(integer) * 1000 + int(float(decimal) * 10) + else: + return -1 + + +def softmax(x): + max = np.max(x, axis=1, keepdims=True) + e_x = np.exp(x - max) + sum = np.sum(e_x, axis=1, keepdims=True) + f_x = e_x / sum + return f_x + + +def get_csr_value(mat, layout, nnz): + row, col = mat.shape[0], mat.shape[1] + value = np.zeros(nnz) + ptr = 0 + for i in range(row): + for j in range(col): + if layout[i][j] == 1: + value[ptr] = mat[i][j] + ptr += 1 + return value + + +def ref_sparse_attention(q, k, v, offset, columns): + row, col, nnz = q.shape[0], q.shape[1], columns.shape[0] + mat = np.zeros((row, row)) + for cur_row in range(row): + start_ptr = int(offset[cur_row]) + end_ptr = int(offset[cur_row + 1]) + for ptr in range(start_ptr, end_ptr): + cur_col = int(columns[ptr]) + mat[cur_row][cur_col] = 1 + a = np.dot(q, k.T) * mat + a_value = get_csr_value(a, mat, nnz) + scaling = float(col)**-0.5 + a = scaling * a + for i in range(row): + for j in range(row): + if mat[i][j] == 0: + a[i][j] = float('-inf') + b = softmax(a) + b_value = get_csr_value(b, mat, nnz) + result = np.dot(b, v) + return result, a_value, b_value + + +def ref_batch_sparse_attention(q, k, v, offset, columns): + batch_size, num_heads, row, col = q.shape + nnz = columns.shape[2] + result = np.zeros((batch_size, num_heads, row, col)) + result_sdd = np.zeros((batch_size, num_heads, nnz)) + result_softmax = np.zeros((batch_size, num_heads, nnz)) + for i in range(batch_size): + for j in range(num_heads): + cur_q, cur_k, cur_v, = q[i][j], k[i][j], v[i][j] + cur_offset, cur_columns = offset[i][j], columns[i][j] + cur_result, cur_sdd, cur_softmax = ref_sparse_attention( + cur_q, cur_k, cur_v, cur_offset, cur_columns) + result[i][j] = cur_result + result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax + return result, result_sdd, result_softmax + + +def init_csr_format(batch_size, num_heads, rows, blocksize): + block_num, block_last = rows / blocksize, rows % blocksize + nnz_num = block_num * blocksize * blocksize + block_last * block_last + offset = np.zeros(rows + 1) + columns = np.zeros(int(nnz_num)) + mat = np.zeros((rows, rows)) + for i in range(0, rows, blocksize): + for x in range(blocksize): + for y in range(blocksize): + p_x, p_y = i + x, i + y + if (p_x < rows) and (p_y < rows): + mat[p_x][p_y] = 1 + p_offset, p_column, count = 0, 0, 0 + for i in range(rows): + for j in range(rows): + if mat[i][j] != 0: + count += 1 + columns[p_column] = j + p_column += 1 + p_offset += 1 + offset[p_offset] = count + offset = np.expand_dims(np.expand_dims(offset, 0), 0) + offset = offset.repeat(num_heads, axis=1) + offset = offset.repeat(batch_size, axis=0) + columns = np.expand_dims(np.expand_dims(columns, 0), 0) + columns = columns.repeat(num_heads, axis=1) + columns = columns.repeat(batch_size, axis=0) + return offset, columns + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" +) +class 
TestSparseAttentionOp(OpTest): + def config(self): + self.shape = (1, 1, 16, 8) + self.blocksize = 2 + self.dtype = "float64" + + def setUp(self): + paddle.enable_static() + self.config() + self.op_type = "sparse_attention" + self.place = paddle.CUDAPlace(0) + self.q = np.random.random(self.shape).astype(self.dtype) + self.k = np.random.random(self.shape).astype(self.dtype) + self.v = np.random.random(self.shape).astype(self.dtype) + offset, columns = init_csr_format(self.shape[0], self.shape[1], + self.shape[2], self.blocksize) + self.offset = offset.astype('int32') + self.columns = columns.astype('int32') + + result, result_sdd, result_softmax = ref_batch_sparse_attention( + self.q, self.k, self.v, self.offset, self.columns) + + self.inputs = { + 'Q': self.q, + 'K': self.k, + 'V': self.v, + 'Offset': self.offset, + 'Columns': self.columns + } + self.outputs = { + 'Out': result.astype(self.dtype), + 'SparseDotSdd': result_sdd.astype(self.dtype), + 'Softmax': result_softmax.astype(self.dtype) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['Q'], 'Out') + self.check_grad_with_place(self.place, ['K'], 'Out') + self.check_grad_with_place(self.place, ['V'], 'Out') + + +class TestSparseAttentionOpFp32Test(TestSparseAttentionOp): + def config(self): + self.shape = (1, 1, 8, 16) + self.blocksize = 2 + self.dtype = "float32" + + +class TestSparseAttentionOpShapeTest(TestSparseAttentionOp): + def config(self): + self.shape = (2, 2, 32, 8) + self.blocksize = 8 + self.dtype = "float64" + + +@unittest.skipIf( + not core.is_compiled_with_cuda() or get_cuda_version() < 11020, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.2" +) +class TestSparseAttentionAPI(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (1, 1, 8, 4) + self.blocksize = 2 + self.dtype = 'float64' + + def test_static_graph(self): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program()): + Q = paddle.static.data(name="Q", shape=self.shape, dtype=self.dtype) + K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype) + V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype) + + batch_size, num_heads, rows = self.shape[0], self.shape[ + 1], self.shape[2] + block_num = rows / self.blocksize + block_last = rows % self.blocksize + sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last + offset_shape = (batch_size, num_heads, rows + 1) + columns_shape = (batch_size, num_heads, int(sparse_nnz_num)) + + offset = paddle.static.data( + name="Offset", shape=offset_shape, dtype="int32") + columns = paddle.static.data( + name="Columns", shape=columns_shape, dtype="int32") + Out = F.sparse_attention(Q, K, V, offset, columns) + + Q_np = np.random.random(self.shape).astype(self.dtype) + K_np = np.random.random(self.shape).astype(self.dtype) + V_np = np.random.random(self.shape).astype(self.dtype) + offset_np, columns_np = init_csr_format( + self.shape[0], self.shape[1], self.shape[2], self.blocksize) + offset_np = offset_np.astype('int32') + columns_np = columns_np.astype('int32') + + exe = fluid.Executor(self.place) + fetches_result = exe.run(feed={ + "Q": Q_np, + "K": K_np, + "V": V_np, + "Offset": offset_np, + "Columns": columns_np + }, + fetch_list=[Out]) + expected_result, __, __ = ref_batch_sparse_attention( + Q_np, K_np, V_np, offset_np, columns_np) + + self.assertTrue( + np.allclose( + 
fetches_result, expected_result, atol=1e-5)) + + def test_dygraph(self): + paddle.disable_static() + offset, columns = init_csr_format(self.shape[0], self.shape[1], + self.shape[2], self.blocksize) + offset = offset.astype('int32') + columns = columns.astype('int32') + query = np.random.random(self.shape).astype(self.dtype) + key = np.random.random(self.shape).astype(self.dtype) + value = np.random.random(self.shape).astype(self.dtype) + + paddle_query = paddle.to_tensor(query, place=self.place) + paddle_key = paddle.to_tensor(key, place=self.place) + paddle_value = paddle.to_tensor(value, place=self.place) + paddle_offset = paddle.to_tensor(offset, place=self.place) + paddle_colunmns = paddle.to_tensor(columns, place=self.place) + + paddle_result = F.sparse_attention(paddle_query, paddle_key, + paddle_value, paddle_offset, + paddle_colunmns) + + numpy_result, __, __ = ref_batch_sparse_attention(query, key, value, + offset, columns) + numpy_result = numpy_result.astype(self.dtype) + + self.assertTrue( + np.allclose( + paddle_result.numpy(), numpy_result, atol=1e-5)) + + +class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 2, 8, 4) + self.blocksize = 2 + self.dtype = 'float32' + + +class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 2, 64, 32) + self.blocksize = 2 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (2, 1, 64, 32) + self.blocksize = 2 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (4, 4, 128, 32) + self.blocksize = 8 + self.dtype = 'float64' + + +class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI): + def setUp(self): + self.place = paddle.CUDAPlace(0) + self.shape = (3, 3, 35, 15) + self.blocksize = 3 + self.dtype = 'float64' + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py index 41a8a9750cb64..3beb6a537eca0 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py +++ b/python/paddle/fluid/tests/unittests/test_tensor_fill_diagonal_.py @@ -50,6 +50,36 @@ def test_dim2_normal(self): (y.grad.numpy().astype('float32') == expected_grad).all(), True) + def test_offset(self): + expected_np = np.array( + [[2, 2, 1], [2, 2, 2], [2, 2, 2]]).astype('float32') + expected_grad = np.array( + [[1, 1, 0], [1, 1, 1], [1, 1, 1]]).astype('float32') + + typelist = ['float32', 'float64', 'int32', 'int64'] + places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + + for idx, p in enumerate(places): + if idx == 0: + paddle.set_device('cpu') + else: + paddle.set_device('gpu') + for dtype in typelist: + x = paddle.ones((3, 3), dtype=dtype) + x.stop_gradient = False + y = x * 2 + y.fill_diagonal_(1, offset=2, wrap=True) + loss = y.sum() + loss.backward() + + self.assertEqual( + (y.numpy().astype('float32') == expected_np).all(), True) + self.assertEqual( + (y.grad.numpy().astype('float32') == expected_grad).all(), + True) + def test_bool(self): expected_np = np.array( [[False, True, True], [True, False, True], [True, True, False]]) diff --git 
a/python/paddle/fluid/tests/unittests/test_tensordot.py b/python/paddle/fluid/tests/unittests/test_tensordot.py
new file mode 100644
index 0000000000000..29f3308988f6d
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_tensordot.py
@@ -0,0 +1,238 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import unittest
+import paddle.fluid.core as core
+import numpy as np
+import itertools as it
+
+np.set_printoptions(threshold=np.inf)
+
+
+def tensordot_np(x, y, axes):
+    if isinstance(axes, paddle.fluid.framework.Variable):
+        axes = axes.tolist()
+
+    # np.tensordot does not support empty axes
+    if not axes:
+        axes = 0
+    if (isinstance(axes, (tuple, list))):
+        if all(np.issubdtype(type(i), np.integer) for i in axes):
+            axes = [axes, axes]
+        else:
+            axes_x = axes[0]
+            if len(axes) > 1:
+                axes_y = axes[1]
+            else:
+                axes_y = axes_x
+            len_axes_x, len_axes_y = len(axes_x), len(axes_y)
+            if len_axes_x < len_axes_y:
+                axes_x = axes_x + axes_y[len_axes_x:]
+            elif len_axes_y < len_axes_x:
+                axes_y = axes_y + axes_x[len_axes_y:]
+            axes = [axes_x, axes_y]
+
+    # np.tensordot does not support broadcast
+    if (isinstance(axes, (tuple, list))):
+        axes_x, axes_y = axes
+    else:
+        axes_x = list(range(x.ndim - axes, x.ndim))
+        axes_y = list(range(axes))
+    shape_x, shape_y = list(np.shape(x)), list(np.shape(y))
+    for i in range(len(axes_x)):
+        dim_x, dim_y = axes_x[i], axes_y[i]
+        sx, sy = shape_x[dim_x], shape_y[dim_y]
+        if sx == 1:
+            shape_y[dim_y] = 1
+            y = np.sum(y, dim_y)
+            y = np.reshape(y, shape_y)
+        elif sy == 1:
+            shape_x[dim_x] = 1
+            x = np.sum(x, dim_x)
+            x = np.reshape(x, shape_x)
+
+    return np.tensordot(x, y, axes)
+
+
+class TestTensordotAPI(unittest.TestCase):
+    def setUp(self):
+        self.set_dtype()
+        self.set_input_shape()
+        self.set_input_data()
+
+    def set_dtype(self):
+        self.dtype = np.float32
+
+    def set_input_shape(self):
+        self.x_shape = [5, 5, 5, 5]
+        self.y_shape = [5, 5, 5, 5]
+
+    def set_input_data(self):
+        self.x = np.random.random(self.x_shape).astype(self.dtype)
+        self.y = np.random.random(self.y_shape).astype(self.dtype)
+        self.all_axes = [2]
+
+    def run_dygraph(self, place):
+        paddle.disable_static()
+        x = paddle.to_tensor(self.x, place=place)
+        y = paddle.to_tensor(self.y, place=place)
+        paddle_res = paddle.tensordot(x, y, self.axes)
+        np_res = tensordot_np(self.x, self.y, self.axes)
+        np.testing.assert_allclose(paddle_res, np_res, rtol=1e-6)
+
+    def run_static(self, place):
+        paddle.enable_static()
+        with paddle.static.program_guard(paddle.static.Program(),
+                                         paddle.static.Program()):
+            x = paddle.static.data(
+                name='x', shape=self.x_shape, dtype=self.dtype)
+            y = paddle.static.data(
+                name='y', shape=self.y_shape, dtype=self.dtype)
+            z = paddle.tensordot(x, y, self.axes)
+            exe = paddle.static.Executor(place)
+            paddle_res = exe.run(feed={'x': self.x,
+                                       'y': self.y},
+                                 fetch_list=[z])
+            np_res = tensordot_np(self.x, self.y, self.axes)
+            np.testing.assert_allclose(paddle_res[0], np_res, rtol=1e-6)
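A hedged usage sketch of the axes forms that tensordot_np above normalizes (not part of the test file; it assumes the paddle.tensordot API added by this patch and uses illustrative shapes):

import numpy as np
import paddle

x = paddle.rand([5, 5, 5, 5])
y = paddle.rand([5, 5, 5, 5])

# An integer n contracts the last n axes of x with the first n axes of y ...
a = paddle.tensordot(x, y, axes=2)
# ... which is the same contraction written out explicitly for both operands.
b = paddle.tensordot(x, y, axes=[[2, 3], [0, 1]])
print(np.allclose(a.numpy(), b.numpy()))  # True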
+ def test_cases(self): + self.all_axes = [] + axial_index = range(4) + all_permutations = list(it.permutations(axial_index, 0)) + list( + it.permutations(axial_index, 1)) + list( + it.permutations(axial_index, 2)) + list( + it.permutations(axial_index, 3)) + list( + it.permutations(axial_index, 4)) + self.all_axes.extend(list(i) for i in all_permutations) + + for axes_x in all_permutations: + for axes_y in all_permutations: + if len(axes_x) < len(axes_y): + supplementary_axes_x = axes_x + axes_y[len(axes_x):] + if any( + supplementary_axes_x.count(i) > 1 + for i in supplementary_axes_x): + continue + elif len(axes_y) < len(axes_x): + supplementary_axes_y = axes_y + axes_x[len(axes_y):] + if any( + supplementary_axes_y.count(i) > 1 + for i in supplementary_axes_y): + continue + self.all_axes.append([list(axes_x), list(axes_y)]) + + self.all_axes.extend(range(5)) + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + self.run_static(place) + + +class TestTensordotAPIFloat64(TestTensordotAPI): + def set_dtype(self): + self.dtype = np.float64 + + +class TestTensordotAPIAxesType(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [3, 4, 4] + self.y_shape = [4, 4, 5] + + def test_cases(self): + self.all_axes = [ + 0, 1, 2, (1, ), [1], ((1, ), ), ([1], ), ((2, 1), (0, )), ( + (1, 2), (0, 1)), ([1, 2], [0, 1]), ([1, 2], [0, 1]), + [[1, 2], [0, 1]] + ] + + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + self.run_static(place) + + # The 'axes' with type 'Tensor' in tensordot is not available in static mode + paddle.disable_static() + for place in places: + self.all_axes = [ + paddle.to_tensor([1]), (paddle.to_tensor([1])), + (paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])), + [paddle.to_tensor([1, 2]), paddle.to_tensor([0, 1])], + paddle.to_tensor([[1, 2], [0, 1]]) + ] + for axes in self.all_axes: + self.axes = axes + for place in places: + self.run_dygraph(place) + + def test_error(self): + self.all_axes = [[[[0], [1]]], 0.1, -1, 100, [[1, 2], [0, 0]], + [[1, 2], [0, -1]], [0, 1, 2, 3]] + paddle.disable_static() + x = paddle.to_tensor(self.x) + y = paddle.to_tensor(self.y) + for axes in self.all_axes: + with self.assertRaises(BaseException): + paddle.tensordot(x, y, axes) + + +class TestTensordotAPIAxesTypeFloat64(TestTensordotAPIAxesType): + def set_dtype(self): + self.dtype = np.float64 + + +class TestTensordotAPIBroadcastCase1(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 1, 1, 5] + self.y_shape = [1, 5, 1, 1] + + +class TestTensordotAPIBroadcastCase2(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 5, 5, 5] + self.y_shape = [1, 1, 1, 5] + + +class TestTensordotAPIBroadcastCase3(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [5, 5, 1, 5] + + +class TestTensordotAPIBroadcastCase4(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [5, 5, 5, 1] + self.y_shape = [1, 1, 1, 1] + + +class TestTensordotAPIBroadcastCase5(TestTensordotAPI): + def set_input_shape(self): + self.x_shape = [1, 1, 5, 5] + self.y_shape = [5, 5, 1, 5] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py 
b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py index 26d63826cc87a..1c8c89d13abc7 100644 --- a/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py +++ b/python/paddle/fluid/tests/unittests/white_list/op_threshold_white_list.py @@ -46,6 +46,7 @@ 'cudnn_lstm', \ 'rnn', \ 'lgamma', \ + 'sparse_attention', \ 'svd', \ 'matrix_power', \ 'solve', \ diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index ec8602ec7e672..ea88a89e68224 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -65,7 +65,7 @@ def transpile(self, startup_program, main_program, rank, endpoints, self.main_program = default_main_program() self.nranks = len(endpoints) - if self.nranks == 1 and self.mode != "single_process_multi_thread": + if self.nranks == 1 and self.mode != "single_process_multi_thread" and self.mode != "box": raise ValueError('the number of endpoints must > 1') if rank < 0: @@ -441,9 +441,14 @@ class MultiThread(GradAllReduce): ''' ''' - def __init__(self, nrings=1): + def __init__(self, nrings=1, trans_mode="all_reduce"): GradAllReduce.__init__(self, nrings) - self.mode = "single_process_multi_thread" + self.mode = "box" + self.trans_mode = trans_mode + self.fuse_grad_size_in_num = 128 + gpu_nums = os.getenv("FLAGS_selected_gpus", + "0,1,2,3,4,5,6,7,8").split(",") + self.gpu_num = len(gpu_nums) def _transpile_startup_program(self): if len(self.endpoints) > 1: @@ -460,3 +465,259 @@ def _transpile_startup_program(self): print("begin to _transpile_startup_program for single-node") block = self.startup_program.global_block() block.append_op(type='c_comm_init_all', attrs={'ring_id': 0}) + + def _transpile_main_program(self): + self._insert_scale_loss_grad_ops() + if self.trans_mode == "all_gather": + print("begin to transpile in all-gather mode") + self.allgather_ranks = self.nranks * self.gpu_num + self._insert_allgather_ops() + self._update_adam_ops() + elif self.trans_mode == "fuse_all_reduce": + print("begin to transpile in fuse all-reduce mode") + self._insert_fuse_allreduce_ops() + else: + print("begin to transpile in all-reduce mode") + self._insert_allreduce_ops() + + def _insert_allgather_ops(self): + """ + insert allgather op to the main_program + """ + block = self.main_program.global_block() + ring_id = -1 + grad = None + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0 + + offset = idx + for i in range(0, len(op_role_var), 2): + param = block.vars[op_role_var[i]] + new_grad_var = block.create_var( + name=op_role_var[i] + "_allgather", + shape=[self.allgather_ranks] + list(param.shape), + persistable=False, + dtype=core.VarDesc.VarType.FP32, + stop_gradient=True) + grad = block.vars[op_role_var[i + 1]] + if param.is_distributed: # no need to care: used in PLSC + continue + + if offset == idx: + offset += 1 + block._insert_op( + offset, + type='c_sync_calc_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={self.op_role_key: OpRole.Backward}) + offset += 1 + + # As we search ops reversedly, we should insert c_allgather + # op in the same way to keep the ring_id alternate + ring_id = (ring_id + 1) % self.nrings + block._insert_op( + offset, + type='c_allgather', + inputs={'X': grad}, + outputs={'Out': new_grad_var}, + attrs={ 
+ 'nranks': self.allgather_ranks, + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + + if grad is None: + return + + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for ring_id in range(self.nrings): + block._insert_op( + idx + ring_id, + type='c_sync_comm_stream', + inputs={'X': grad}, + outputs={'Out': grad}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + break + + def _update_adam_ops(self): + """ + remove the original adam op, and add new adam ops + """ + block = self.main_program.global_block() + + for idx, op in reversed(list(enumerate(block.ops))): + if self._is_optimizer_op(op): + offset = idx + if op.type != 'adam' and op.type != 'lamb': # filter out scale op + continue + param_name = op.input("Param")[0] + inputs = { + "Param": block.vars[op.input("Param")[0]], + "LearningRate": block.vars[op.input("LearningRate")[0]], + "Moment1": block.vars[op.input("Moment1")[0]], + "Moment2": block.vars[op.input("Moment2")[0]], + "Beta1Pow": block.vars[op.input("Beta1Pow")[0]], + "Beta2Pow": block.vars[op.input("Beta2Pow")[0]] + } + outputs = { + "ParamOut": block.vars[op.output("ParamOut")[0]], + "Moment1Out": block.vars[op.output("Moment1Out")[0]], + "Moment2Out": block.vars[op.output("Moment2Out")[0]], + "Beta1PowOut": block.vars[op.output("Beta1PowOut")[0]], + "Beta2PowOut": block.vars[op.output("Beta2PowOut")[0]] + } + attrs = { + "epsilon": op.attr('epsilon'), + "beta1": op.attr('beta1'), + "beta2": op.attr('beta2'), + "lazy_mode": op.attr('lazy_mode'), + "min_row_size_to_use_multithread": + op.attr('min_row_size_to_use_multithread') + } + split_vars = [ + block.create_var( + name=param_name + "_" + str(i), + shape=block.vars[op.input("Param")[0]].shape, + persistable=False, + dtype=core.VarDesc.VarType.FP32, + stop_gradient=True) for i in range(self.allgather_ranks) + ] + block._insert_op( + offset, + type="split", + inputs={ + 'X': block.vars[op.input("Param")[0] + "_allgather"] + }, + outputs={'Out': split_vars}, + attrs={'num': self.allgather_ranks, + 'axis': 0}) + offset += 1 + + for i in range(self.allgather_ranks): + inputs["Grad"] = split_vars[i] + block._insert_op( + offset, + type=op.type, + inputs=inputs, + outputs=outputs, + attrs=attrs) + offset += 1 + # remove the original adam op + block._remove_op(offset) + + def _insert_fuse_allreduce_ops(self): + """ + insert coalesce_tensor and all reduce ops + """ + block = self.main_program.global_block() + ring_id = 0 % self.nrings + grad = None + param_grads = [] + # find all grad params + for op in reversed(block.ops): + if self._is_backward_op(op) and \ + self.op_role_var_key in op.attr_names: + op_role_var = op.all_attrs()[self.op_role_var_key] + if len(op_role_var) == 0: + continue + assert len(op_role_var) % 2 == 0, "vars need to be one param var followed by one grad var, " \ + "but got odd number of vars" + for i in range(0, len(op_role_var), 2): + param_name = op_role_var[i] + param = block.var(param_name) + grad_name = op_role_var[i + 1] + grad = block.var(grad_name) + if param.is_distributed: + continue + param_grads.append(grad) + if grad is None: + return + + segments = [] + last_dtype = None + # split the grad based on dtype and fused size + for var in param_grads: + if len(segments) == 0 \ + or len(segments[-1]) == self.fuse_grad_size_in_num \ + or var.dtype != last_dtype: + segments.append([var]) + last_dtype = var.dtype + else: + segments[-1].append(var) + + fused_vars = [] + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for 
segment in segments: + # insert coalesce tensor + tmp_var = block.create_var( + name=unique_name.generate('FusedOutput_{}'.format( + segment[0].name)), + dtype=segment[0].dtype, + persistable=False, + stop_gradient=True) + fused_vars.append(tmp_var) + block._insert_op( + idx, + type="coalesce_tensor", + inputs={"Input": segment}, + outputs={"Output": segment, + "FusedOutput": tmp_var}, + attrs={ + "copy_data": True, + "use_align": True, + "dtype": segment[0].dtype, + self.op_role_key: OpRole.Backward + }) + break + + # insert the allreduce_sum op + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + for fused_var in fused_vars: + block._insert_op( + idx, + type='c_allreduce_sum', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': False, + self.op_role_key: OpRole.Backward + }) + block._insert_op( + idx, + type='c_sync_calc_stream', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={self.op_role_key: OpRole.Backward}) + break + + if len(fused_vars) == 0: + block._sync_with_cpp() + return + + # insert the sync comm op + for idx, op in enumerate(block.ops): + if self._is_optimizer_op(op): + block._insert_op( + idx, + type='c_sync_comm_stream', + inputs={'X': fused_vars[0]}, + outputs={'Out': fused_vars[0]}, + attrs={ + 'ring_id': ring_id, + self.op_role_key: OpRole.Backward + }) + break + block._sync_with_cpp() diff --git a/python/paddle/linalg.py b/python/paddle/linalg.py index d57d9a4bdb678..726355379e7b6 100644 --- a/python/paddle/linalg.py +++ b/python/paddle/linalg.py @@ -14,6 +14,7 @@ from .tensor.linalg import cholesky # noqa: F401 from .tensor.linalg import norm # noqa: F401 +from .tensor.linalg import eig # noqa: F401 from .tensor.linalg import cond # noqa: F401 from .tensor.linalg import matrix_power # noqa: F401 from .tensor.linalg import solve # noqa: F401 @@ -32,6 +33,7 @@ 'norm', 'cond', 'inv', + 'eig', 'eigvals', 'multi_dot', 'matrix_rank', diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 7965b362b9c55..4151f25b94aff 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -112,6 +112,8 @@ from ...fluid.layers import gather_tree # noqa: F401 from ...fluid.layers import temporal_shift # noqa: F401 +from .sparse_attention import sparse_attention + __all__ = [ #noqa 'conv1d', 'conv1d_transpose', @@ -207,4 +209,5 @@ 'layer_norm', 'instance_norm', 'class_center_sample', + 'sparse_attention', ] diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index da2d010c323b5..b1db45ad50669 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1668,12 +1668,13 @@ def cross_entropy(input, format(invalid_label[0], 0)) # TODO: Temporarily use paddle.nonzero instead of paddle.max # to detect and find out possible illegal label values - if len(paddle.nonzero(valid_label >= input.shape[-1])) > 0: + if len(paddle.nonzero(valid_label >= input.shape[axis])) > 0: invalid_label = paddle.gather_nd( - valid_label, paddle.nonzero(valid_label >= input.shape[-1])) + valid_label, + paddle.nonzero(valid_label >= input.shape[axis])) raise ValueError( "Target({}) is out of class_dimension's upper bound({})". 
- format(invalid_label[0], input.shape[-1] - 1)) + format(invalid_label[0], input.shape[axis] - 1)) _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', @@ -1700,19 +1701,28 @@ def cross_entropy(input, out = _C_ops.elementwise_mul(out, weight_gather_reshape) else: - if input.shape[-1] != weight.shape[-1]: + if input.shape[axis] != weight.shape[-1]: raise ValueError( - "input's class_dimension({}) must equal to \ - weight's class_dimension({}) \ - when weight is provided" - .format(input.shape[-1], weight.shape[-1])) + "input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided"\ + .format(input.shape[axis], weight.shape[-1])) ignore_weight_mask = paddle.cast((label != ignore_index), out.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - -1] == 1: - ignore_weight_mask.squeeze_(-1) - weight_gather = _C_ops.gather_nd(weight, valid_label) + axis] == 1: + # TODO: Temporarily use squeeze instead of squeeze_ + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, + axis) + if axis != -1 and axis != valid_label.ndim - 1: + temp_perm = list(range(axis % valid_label.ndim)) \ + + list(range((axis % valid_label.ndim + 1) , valid_label.ndim)) \ + + [axis % valid_label.ndim] + weight_gather = _C_ops.gather_nd( + weight, valid_label.transpose(temp_perm)) + else: + weight_gather = _C_ops.gather_nd(weight, valid_label) weight_gather = _C_ops.elementwise_mul(weight_gather, ignore_weight_mask) input_shape = list(label.shape) @@ -1807,20 +1817,27 @@ def cross_entropy(input, weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) else: - if input.shape[-1] != weight.shape[-1]: - raise ValueError("input's class_dimension({}) must equal to "\ - "weight's class_dimension({}) "\ - "when weight is provided" - .format(input.shape[-1], weight.shape[-1])) + if input.shape[axis] != weight.shape[-1]: + raise ValueError("input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided"\ + .format(input.shape[axis], weight.shape[-1])) valid_label = paddle.where(label == ignore_index, paddle.zeros_like(label), label) ignore_weight_mask = paddle.cast((label != ignore_index), input.dtype) if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - -1] == 1: - ignore_weight_mask = paddle.squeeze(ignore_weight_mask, -1) - weight_gather = paddle.gather_nd(weight, valid_label) + axis] == 1: + ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) + if axis != -1 and axis != valid_label.ndim - 1: + temp_perm = list(range(axis % valid_label.ndim)) \ + + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ + + [axis % valid_label.ndim] + weight_gather = paddle.gather_nd( + weight, paddle.transpose(valid_label, temp_perm)) + else: + weight_gather = paddle.gather_nd(weight, valid_label) weight_gather = paddle.multiply(weight_gather, ignore_weight_mask) input_shape = list(label.shape) diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py new file mode 100644 index 0000000000000..f57669f11457f --- /dev/null +++ b/python/paddle/nn/functional/sparse_attention.py @@ -0,0 +1,144 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
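For the cross_entropy change above: when the class axis is not the last dimension, the patch builds temp_perm to move the dimension at position axis to the end of valid_label before gather_nd picks per-class weights. A pure-Python sketch of that permutation (the helper name is hypothetical, not from the patch):

def class_axis_to_last(axis, ndim):
    # Mirrors the temp_perm expression in the patch.
    axis = axis % ndim
    return list(range(axis)) + list(range(axis + 1, ndim)) + [axis]

print(class_axis_to_last(1, 3))   # [0, 2, 1]
print(class_axis_to_last(-1, 3))  # [0, 1, 2]  (already last; the patch skips this case)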
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +import paddle +from ...fluid.framework import in_dygraph_mode, default_main_program +from paddle.fluid.layer_helper import LayerHelper +from ...fluid.framework import in_dygraph_mode +from paddle import _C_ops + + +def sparse_attention(query, + key, + value, + sparse_csr_offset, + sparse_csr_columns, + name=None): + r""" + This operator sparsify the Attention matrix in Transformer module + to achieve the effect of reducing memory consumption and computation. + The sparse layout is expressed in CSR format and contains two parameters, + ``offset`` and ``columns``. + + .. math:: + + result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V + + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + The dimensions of the three parameters are the same. + ``d`` represents the size of the last dimension of the three parameters. + + Parameters: + query(Tensor): The query tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + key(Tensor): The key tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + value(Tensor): The value tensor in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + sparse_csr_offset(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the offset represents + the number of non-zero elements in each row of the matrix. + It's a 3-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len + 1]`. + The dtype should be ``int32``. + sparse_csr_columns(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the columns represent + the column index values of non-zero elements in the matrix. + It's a 3-D tensor with a shape of + :math:`[batch\_size, num\_heads, sparse\_nnz]`. + The dtype should be ``int32``. + name(str, optional): The default value is None. Normally there is no need for user + to set this property. For more information, please refer to + :ref:`api_guide_Name`. + + Returns: + A Tensor which refers to the result in the Attention module. + It's a 4-D tensor with a shape of + :math:`[batch\_size, num\_heads, seq\_len, head\_dim]`. + The dtype can be ``float32`` and ``float64``. + + Examples: + .. 
code-block:: python + + # required: skiptest + import paddle + import numpy as np + + query_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + key_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + value_data = np.array([[[[0, 1,], [2, 3], + [ 0, 1], [2, 3]]]]).astype("float32") + sparse_csr_offset_data = np.array([[[0, 2, + 4, 6, 8]]]).astype("int32") + sparse_csr_columns_data = np.array([[[0, 1, + 0, 1, 2, 3, 2, 3]]]).astype("int32") + print(query_data.shape) + # (1, 1, 4, 2) + print(sparse_csr_offset_data.shape) + # (1, 1, 5) + print(sparse_csr_columns_data.shape) + # (1, 1, 8) + paddle.disable_static() + query = paddle.to_tensor(query_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + key = paddle.to_tensor(key_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + value = paddle.to_tensor(value_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, + place=paddle.CUDAPlace(0)) + output = paddle.nn.functional.sparse_attention(query, key, + value, offset, columns) + print(output) + + # [[[[1.60885942, 2.60885954], + # [1.99830270, 2.99830270], + # [1.60885942, 2.60885954], + # [1.99830270, 2.99830270]]]] + """ + if in_dygraph_mode(): + result_attention, result_sdd, result_softmax = _C_ops.sparse_attention( + query, key, value, sparse_csr_offset, sparse_csr_columns) + return result_attention + + helper = LayerHelper('sparse_attention', **locals()) + dtype = helper.input_dtype(input_param_name='Q') + out = helper.create_variable_for_type_inference(dtype) + result_sdd = helper.create_variable_for_type_inference(dtype) + result_softmax = helper.create_variable_for_type_inference(dtype) + inputs = { + 'Q': query, + 'K': key, + 'V': value, + 'Offset': sparse_csr_offset, + 'Columns': sparse_csr_columns + } + outputs = { + 'Out': out, + 'SparseDotSdd': result_sdd, + 'Softmax': result_softmax + } + helper.append_op(type='sparse_attention', inputs=inputs, outputs=outputs) + return out diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index 10d6af651777e..f26ee80d0af60 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -171,9 +171,9 @@ def __init__(self, self._lr_to_coeff = dict() if lr_ratio is not None: assert isinstance(lr_ratio, Callable) - if core.is_compiled_with_xpu() or core.is_compiled_with_npu(): + if not core.is_compiled_with_cuda(): raise NotImplementedError( - "'lr_ratio' is unimplemented in XPU and NPU") + "'lr_ratio' is unimplemented in CPU, XPU and NPU") self._lr_ratio = lr_ratio super(AdamW, self).__init__( @@ -298,14 +298,14 @@ def _append_optimize_op(self, block, param_and_grad): _beta2 = self._beta2 if not isinstance( self._beta2, Variable) else self._beta2.numpy().item(0) - _, _, _, _, _, _ = _C_ops.adam( + _, _, _, _, _, _ = _C_ops.adamw( param_and_grad[0], param_and_grad[1], lr, moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, 'beta2', _beta2, 'coeff', self._coeff, 'multi_precision', - find_master) + find_master, 'lr_ratio', lr_ratio_) return None diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 0f463b0c7d941..20af4158df48f 
100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -43,6 +43,7 @@ from ..fluid.framework import cpu_places # noqa: F401 from ..fluid.framework import cuda_places # noqa: F401 from ..fluid.framework import xpu_places # noqa: F401 +from ..fluid.framework import npu_places # noqa: F401 from ..fluid.framework import Variable # noqa: F401 from ..fluid.layers.control_flow import Print # noqa: F401 from ..fluid.layers.nn import py_func # noqa: F401 @@ -99,6 +100,7 @@ 'cpu_places', 'cuda_places', 'xpu_places', + 'npu_places', 'Variable', 'create_global_var', 'accuracy', diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 02b34bb21a792..c8f897c21648f 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -45,6 +45,7 @@ from .linalg import bmm # noqa: F401 from .linalg import histogram # noqa: F401 from .linalg import mv # noqa: F401 +from .linalg import eig # noqa: F401 from .linalg import matrix_power # noqa: F401 from .linalg import eigvals # noqa: F401 from .linalg import multi_dot # noqa: F401 @@ -104,6 +105,7 @@ from .manipulation import unbind # noqa: F401 from .manipulation import roll # noqa: F401 from .manipulation import chunk # noqa: F401 +from .manipulation import tensordot # noqa: F401 from .math import abs # noqa: F401 from .math import acos # noqa: F401 from .math import asin # noqa: F401 @@ -345,6 +347,7 @@ 'slice', 'split', 'chunk', + 'tensordot', 'squeeze', 'squeeze_', 'stack', @@ -386,7 +389,9 @@ 'bitwise_xor', 'bitwise_not', 'broadcast_tensors', + 'eig', 'uniform_', + 'multi_dot', 'solve', ] diff --git a/python/paddle/tensor/fft.py b/python/paddle/tensor/fft.py index 98ca858c0eb85..20fd143589fa4 100644 --- a/python/paddle/tensor/fft.py +++ b/python/paddle/tensor/fft.py @@ -21,30 +21,7 @@ from ..fluid.data_feeder import check_variable_and_dtype from ..fluid.layer_helper import LayerHelper -__all__ = [ - 'fft', - 'fft2', - 'fftn', - 'ifft', - 'ifft2', - 'ifftn', - 'rfft', - 'rfft2', - 'rfftn', - 'irfft', - 'irfft2', - 'irfftn', - 'hfft', - 'hfft2', - 'hfftn', - 'ihfft', - 'ihfft2', - 'ihfftn', - 'fftfreq', - 'rfftfreq', - 'fftshift', - 'ifftshift', -] +__all__ = [] def _check_normalization(norm): @@ -362,7 +339,7 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): xp = paddle.to_tensor(x) irfft_xp = paddle.fft.irfft(xp).numpy() print(irfft_xp) - # [0. 0. 0. 4.] + # [0. 1. 0. 0.] """ return fft_c2r(x, n, axis, norm, forward=False, name=name) @@ -500,7 +477,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): import numpy as np import paddle - x = x = np.mgrid[:4, :4, :4][1] + x = np.mgrid[:4, :4, :4][1] xp = paddle.to_tensor(x) fftn_xp = paddle.fft.fftn(xp, axes=(1, 2)).numpy() print(fftn_xp) @@ -654,9 +631,9 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): # use axes(2, 0) print(paddle.fft.rfftn(x, axes=(2, 0))) # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], + # [[[(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ]], # # [[0j , 0j , 0j ], # [0j , 0j , 0j ], @@ -1135,7 +1112,24 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): refer to :ref:`api_guide_Name` . Returns: - out(Tensor) : The result of the inverse real 2-D FFT. + out(Tensor) : The result of the inverse hermitian 2-D FFT. + + Examples: + + .. 
code-block:: python + + import numpy as np + import paddle + + x = np.mgrid[:5, :5][0].astype(np.float64) + xp = paddle.to_tensor(x) + ihfft2_xp = paddle.fft.ihfft2(xp).numpy() + print(ihfft2_xp) + # [[ 2. +0.j 0. +0.j 0. +0.j ] + # [-0.5-0.68819096j 0. +0.j 0. +0.j ] + # [-0.5-0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.16245985j 0. +0.j 0. +0.j ] + # [-0.5+0.68819096j 0. +0.j 0. +0.j ]] """ _check_at_least_ndim(x, 2) if s is not None: @@ -1273,9 +1267,8 @@ def fftshift(x, axes=None, name=None): import paddle x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) res = paddle.fft.fftshift(fftfreq_xp).numpy() print(res) # [-1.3333334 -0.6666667 0. 0.6666667 1.3333334] @@ -1317,9 +1310,8 @@ def ifftshift(x, axes=None, name=None): import paddle x = np.array([3, 1, 2, 2, 3], dtype=float) - scalar_temp = 0.3 n = x.size - fftfreq_xp = paddle.fft.fftfreq(n, d=scalar_temp) + fftfreq_xp = paddle.fft.fftfreq(n, d=0.3) res = paddle.fft.ifftshift(fftfreq_xp).numpy() print(res) # [ 1.3333334 -1.3333334 -0.6666667 0. 0.6666667] @@ -1346,7 +1338,7 @@ def fft_c2c(x, n, axis, norm, forward, name): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) @@ -1376,7 +1368,7 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): if is_interger(x): x = paddle.cast(x, paddle.get_default_dtype()) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) @@ -1415,7 +1407,7 @@ def fft_c2r(x, n, axis, norm, forward, name): elif is_floating_point(x): x = paddle.cast(x, _real_to_complex_dtype(x.dtype)) _check_normalization(norm) - axis = axis or -1 + axis = axis if axis is not None else -1 _check_fft_axis(x, axis) axes = [axis] axes = _normalize_axes(x, axes) diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 6c898f2d607c9..f112603fbb60f 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -23,6 +23,7 @@ from paddle.common_ops_import import core from paddle.common_ops_import import VarDesc from paddle import _C_ops +import paddle __all__ = [] @@ -448,7 +449,7 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): format(axis)) -def dist(x, y, p=2): +def dist(x, y, p=2, name=None): r""" This OP returns the p-norm of (x - y). It is not a norm in a strict sense, only as a measure @@ -551,8 +552,8 @@ def cond(x, p=None, name=None): Computes the condition number of a matrix or batches of matrices with respect to a matrix norm ``p``. Args: - x (Tensor): The input tensor could be tensor of shape ``(*, m, n)`` where ``*`` is zero or more batch dimensions - for ``p`` in ``(2, -2)``, or of shape ``(*, n, n)`` where every matrix is invertible for any supported ``p``. + x (Tensor): The input tensor could be tensor of shape ``(*, m, n)`` where ``*`` is zero or more batch dimensions + for ``p`` in ``(2, -2)``, or of shape ``(*, n, n)`` where every matrix is invertible for any supported ``p``. And the input data type could be ``float32`` or ``float64``. p (float|string, optional): Order of the norm. Supported values are `fro`, `nuc`, `1`, `-1`, `2`, `-2`, `inf`, `-inf`. Default value is `None`, meaning that the order of the norm is `2`. 
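The `fft_c2c`/`fft_r2c`/`fft_c2r` helpers above switch from `axis or -1` to an explicit `None` check. The reason is a plain Python pitfall: `0` is falsy, so `axis or -1` would silently discard a request to transform along the first axis. A small sketch; the `paddle.fft.fft` call is just one way to observe the effect:

.. code-block:: python

    import paddle

    # `0` is falsy, so `axis or -1` silently rewrites a request for the first axis:
    axis = 0
    print(axis or -1)                          # -1  -- the intended axis is lost
    print(axis if axis is not None else -1)    # 0   -- only a missing axis falls back

    x = paddle.ones([4, 2])
    print(paddle.fft.fft(x, axis=0).shape)     # [4, 2], transformed along the first axis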
@@ -607,7 +608,7 @@ def cond(x, p=None, name=None): # out_minus_inf.numpy() [1.] a = paddle.to_tensor(np.random.randn(2, 4, 4).astype('float32')) - # a.numpy() + # a.numpy() # [[[ 0.14063153 -0.996288 0.7996131 -0.02571543] # [-0.16303636 1.5534962 -0.49919784 -0.04402903] # [-1.1341571 -0.6022629 0.5445269 0.29154757] @@ -975,8 +976,8 @@ def t(input, name=None): return out check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'transpose') + input, 'input', ['float16', 'float32', 'float64', 'int32', + 'int64'], 'transpose') helper = LayerHelper('t', **locals()) out = helper.create_variable_for_type_inference(input.dtype) @@ -1108,17 +1109,17 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): r""" Computes the rank of a matrix. - The rank of a matrix is the number of singular values that are greater than the specified `tol` threshold when hermitian=False, + The rank of a matrix is the number of singular values that are greater than the specified `tol` threshold when hermitian=False, or the number of eigenvalues in absolute value that are greater than the specified `tol` threshold when hermitian=True. Args: - x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch - of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64. - tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest - singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed + x (Tensor): The input tensor. Its shape should be `[..., m, n]`, where `...` is zero or more batch dimensions. If `x` is a batch + of matrices then the output has the same batch dimensions. The data type of `x` should be float32 or float64. + tol (float,Tensor,optional): the tolerance value. Default: None. If `tol` is not specified, and `sigma` is the largest + singular value (or eigenvalues in absolute value), and `eps` is the epsilon value for the dtype of `x`, then `tol` is computed with formula `tol=sigma * max(m,n) * eps`. Note that if `x` is a batch of matrices, `tol` is computed this way for every batch. - hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian, - enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use + hermitian (bool,optional): indicates whether `x` is Hermitian. Default: False. When hermitian=True, `x` is assumed to be Hermitian, + enabling a more efficient method for finding eigenvalues, but `x` is not checked inside the function. Instead, We just use the lower triangular of the matrix to compute. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1225,7 +1226,7 @@ def bmm(x, y, name=None): #output value: #[[[6.0, 6.0],[12.0, 12.0]],[[45.0, 45.0],[60.0, 60.0]]] out_np = out.numpy() - + """ x_shape = x.shape y_shape = y.shape @@ -1251,7 +1252,7 @@ def bmm(x, y, name=None): return out -def histogram(input, bins=100, min=0, max=0): +def histogram(input, bins=100, min=0, max=0, name=None): """ Computes the histogram of a tensor. The elements are sorted into equal width bins between min and max. If min and max are both zero, the minimum and maximum values of the data are used. 
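The `histogram` signature above appears to gain only a `name` argument, so its behaviour should be unchanged. A short sketch of the min == max == 0 convention described in its docstring; the expected counts are worked out by hand and should be treated as illustrative:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([1.0, 2.0, 1.0, 4.0, 5.0])
    # min == max == 0, so the bin range is taken from the data itself (here [1, 5]),
    # split into 4 equal-width bins with the last bin including the maximum.
    print(paddle.histogram(x, bins=4, min=0, max=0))   # expected counts: [2, 1, 0, 2]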
@@ -1351,7 +1352,7 @@ def __check_input(x, vec): return out -def det(x): +def det(x, name=None): """ Calculates determinant value of a square matrix or batches of square matrices. Args: @@ -1360,20 +1361,20 @@ def det(x): Returns: y (Tensor):the determinant value of a square matrix or batches of square matrices. - Example: + Examples: .. code-block:: python import paddle x = paddle.randn([3,3,3]) - A = paddle.det(x) + A = paddle.linalg.det(x) print(A) - + # [ 0.02547996, 2.52317095, -6.15900707]) - + """ if in_dygraph_mode(): return core.ops.determinant(x) @@ -1399,11 +1400,11 @@ def det(x): return out -def slogdet(x): +def slogdet(x, name=None): """ Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. The determinant can be computed with ``sign * exp(logabsdet) - + Supports input of float, double Note that for matrices that have zero determinant, this returns ``(0, -inf)`` @@ -1415,17 +1416,17 @@ def slogdet(x): y (Tensor): A tensor containing the sign of the determinant and the natural logarithm of the absolute value of determinant, respectively. - Example: + Examples: .. code-block:: python import paddle x = paddle.randn([3,3,3]) - A = paddle.slogdet(x) + A = paddle.linalg.slogdet(x) print(A) - + # [[ 1. , 1. , -1. ], # [-0.98610914, -0.43010661, -0.10872950]]) @@ -1461,19 +1462,19 @@ def svd(x, full_matrices=False, name=None): Let :math:`X` be the input matrix or a batch of input matrices, the output should satisfies: .. math:: - X = U * diag(S) * VT - + X = U * diag(S) * VT + Args: x (Tensor): The input tensor. Its shape should be `[..., N, M]`, where `...` is zero or more batch dimensions. N and M can be arbitraty - positive number. Note that if x is sigular matrices, the grad is numerical - instable. The data type of x should be float32 or float64. - full_matrices (bool): A flag to control the behavor of svd. - If full_matrices = True, svd op will compute full U and V matrics, + positive number. Note that if x is sigular matrices, the grad is numerical + instable. The data type of x should be float32 or float64. + full_matrices (bool): A flag to control the behavor of svd. + If full_matrices = True, svd op will compute full U and V matrics, which means shape of U is `[..., N, N]`, shape of V is `[..., M, M]`. K = min(M, N). - If full_matrices = False, svd op will use a economic method to store U and V. + If full_matrices = False, svd op will use a economic method to store U and V. which means shape of U is `[..., N, K]`, shape of V is `[..., M, K]`. K = min(M, N). - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1497,9 +1498,9 @@ def svd(x, full_matrices=False, name=None): print (vh) #VT= [[ 0.51411221, 0.85772294], # [ 0.85772294, -0.51411221]] - + # one can verify : U * S * VT == X - # U * UH == I + # U * UH == I # V * VH == I """ @@ -1526,7 +1527,7 @@ def svd(x, full_matrices=False, name=None): def matrix_power(x, n, name=None): r""" Computes the n-th power of a square matrix or a batch of square matrices. - + Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`n` be an exponent, the equation should be: @@ -1563,17 +1564,17 @@ def matrix_power(x, n, name=None): x = paddle.to_tensor([[1, 2, 3], [1, 4, 9], [1, 8, 27]], dtype='float64') - print(paddle.matrix_power(x, 2)) + print(paddle.linalg.matrix_power(x, 2)) # [[6. 
, 34. , 102.], # [14. , 90. , 282.], # [36. , 250., 804.]] - print(paddle.matrix_power(x, 0)) + print(paddle.linalg.matrix_power(x, 0)) # [[1., 0., 0.], # [0., 1., 0.], # [0., 0., 1.]] - print(paddle.matrix_power(x, -2)) + print(paddle.linalg.matrix_power(x, -2)) # [[ 12.91666667, -12.75000000, 2.83333333 ], # [-7.66666667 , 8. , -1.83333333 ], # [ 1.80555556 , -1.91666667 , 0.44444444 ]] @@ -1593,30 +1594,96 @@ def matrix_power(x, n, name=None): return out +def eig(x, name=None): + """ + This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices. + + .. note:: + If the matrix is a Hermitian or a real symmetric matrix, please use :ref:`paddle.linalg.eigh` instead, which is much faster. + If only eigenvalues is needed, please use :ref:`paddle.linalg.eigvals` instead. + If the matrix is of any shape, please use :ref:`paddle.linalg.svd`. + This API is only supported on CPU device. + The output datatype is always complex for both real and complex input. + + Args: + x (Tensor): A tensor with shape math:`[*, N, N]`, The data type of the x should be one of ``float32``, + ``float64``, ``compplex64`` or ``complex128``. + name (str, optional): The default value is `None`. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Eigenvalues(Tensors): A tensor with shape math:`[*, N]` refers to the eigen values. + Eigenvectors(Tensors): A tensor with shape math:`[*, N, N]` refers to the eigen vectors. + + Examples: + .. code-block:: python + + import paddle + import numpy as np + + paddle.device.set_device("cpu") + + x_data = np.array([[1.6707249, 7.2249975, 6.5045543], + [9.956216, 8.749598, 6.066444 ], + [4.4251957, 1.7983172, 0.370647 ]]).astype("float32") + x = paddle.to_tensor(x_data) + w, v = paddle.linalg.eig(x) + print(w) + # Tensor(shape=[3, 3], dtype=complex128, place=CPUPlace, stop_gradient=False, + # [[(-0.5061363550800655+0j) , (-0.7971760990842826+0j) , + # (0.18518077798279986+0j)], + # [(-0.8308237755993192+0j) , (0.3463813401919749+0j) , + # (-0.6837005269141947+0j) ], + # [(-0.23142567697893396+0j), (0.4944999840400175+0j) , + # (0.7058765252952796+0j) ]]) + + print(v) + # Tensor(shape=[3], dtype=complex128, place=CPUPlace, stop_gradient=False, + # [ (16.50471283351188+0j) , (-5.5034820550763515+0j) , + # (-0.21026087843552282+0j)]) + """ + if in_dygraph_mode(): + w, v = _C_ops.eig(x) + return w, v + + check_variable_and_dtype( + x, 'X', ['float32', 'float64', 'complex64', 'complex128'], 'eig') + helper = LayerHelper('eig', **locals()) + + w = helper.create_variable_for_type_inference(x.dtype) + v = helper.create_variable_for_type_inference(x.dtype) + + inputs = {'X': x} + outputs = {'Eigenvalues': w, 'Eigenvectors': v} + helper.append_op(type='eig', inputs=inputs, outputs=outputs) + + return w, v + + def eigvals(x, name=None): """ Compute the eigenvalues of one or more general matrices. - - Warning: - The gradient kernel of this operator does not yet developed. + + Warning: + The gradient kernel of this operator does not yet developed. If you need back propagation through this operator, please replace it with paddle.linalg.eig. Args: x (Tensor): A square matrix or a batch of square matrices whose eigenvalues will be computed. - Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. + Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32, float64, complex64, or complex128. 
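A minimal usage sketch for the new `paddle.linalg.eig`, following the Returns section above (eigenvalues first, eigenvectors second) and the note that it is CPU-only; the concrete matrix is an assumption chosen so the eigenvalues are known by inspection:

.. code-block:: python

    import paddle

    paddle.device.set_device("cpu")               # eig is documented above as CPU-only

    x = paddle.to_tensor([[2.0, 1.0],
                          [0.0, 3.0]])            # upper triangular: eigenvalues are 2 and 3
    w, v = paddle.linalg.eig(x)

    print(w.shape)   # [2]    -- eigenvalues, in a complex dtype even for real input
    print(v.shape)   # [2, 2] -- eigenvectors, one per column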
- name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. + Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. The eigenvalues are complex-valued even when `x` is real. Examples: .. code-block:: python import paddle - + paddle.set_device("cpu") paddle.seed(1234) @@ -1630,8 +1697,8 @@ def eigvals(x, name=None): """ check_variable_and_dtype(x, 'dtype', - ['float32', 'float64', 'complex64', 'complex128'], - 'eigvals') + ['float32', 'float64', 'complex64', + 'complex128'], 'eigvals') x_shape = list(x.shape) if len(x_shape) < 2: @@ -1657,7 +1724,7 @@ def multi_dot(x, name=None): """ Multi_dot is an operator that calculates multiple matrix multiplications. - Supports inputs of float, double and float16 dtypes. This function does not + Supports inputs of float16(only GPU support), float32 and float64 dtypes. This function does not support batched inputs. The input tensor in [x] must be 2-D except for the first and last can be 1-D. @@ -1699,7 +1766,7 @@ def multi_dot(x, name=None): B_data = np.random.random([4, 5]).astype(np.float32) A = paddle.to_tensor(A_data) B = paddle.to_tensor(B_data) - out = paddle.multi_dot([A, B]) + out = paddle.linalg.multi_dot([A, B]) print(out.numpy().shape) # [3, 5] @@ -1710,7 +1777,7 @@ def multi_dot(x, name=None): A = paddle.to_tensor(A_data) B = paddle.to_tensor(B_data) C = paddle.to_tensor(C_data) - out = paddle.multi_dot([A, B, C]) + out = paddle.linalg.multi_dot([A, B, C]) print(out.numpy().shape) # [10, 7] @@ -1735,7 +1802,7 @@ def multi_dot(x, name=None): def eigh(x, UPLO='L', name=None): """ - Compute the eigenvalues and eigenvectors of a + Compute the eigenvalues and eigenvectors of a complex Hermitian (conjugate symmetric) or a real symmetric matrix. Args: @@ -1804,7 +1871,7 @@ def __check_input(x, UPLO): def pinv(x, rcond=1e-15, hermitian=False, name=None): r""" - Calculate pseudo inverse via SVD(singular value decomposition) + Calculate pseudo inverse via SVD(singular value decomposition) of one matrix or batches of regular matrix. .. math:: @@ -1815,30 +1882,30 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): else: x = u * s * ut (eigh) out = u * 1/s * u.conj().transpose(-2,-1) - + If x is hermitian or symmetric matrix, svd will be replaced with eigh. Args: - x(Tensor): The input tensor. Its shape should be (*, m, n) - where * is zero or more batch dimensions. m and n can be - arbitraty positive number. The data type of x should be + x(Tensor): The input tensor. Its shape should be (*, m, n) + where * is zero or more batch dimensions. m and n can be + arbitraty positive number. The data type of x should be float32 or float64 or complex64 or complex128. When data type is complex64 or cpmplex128, hermitian should be set True. - rcond(Tensor, optional): the tolerance value to determine - when is a singular value zero. Defalut:1e-15. - - hermitian(bool, optional): indicates whether x is Hermitian + rcond(Tensor, optional): the tolerance value to determine + when is a singular value zero. Defalut:1e-15. + + hermitian(bool, optional): indicates whether x is Hermitian if complex or symmetric if real. Default: False. - - name(str|None): A name for this layer(optional). If set None, + + name(str|None): A name for this layer(optional). 
If set None, the layer will be named automatically. - + Returns: - Tensor: The tensor with same data type with x. it represents + Tensor: The tensor with same data type with x. it represents pseudo inverse of x. Its shape should be (*, n, m). - + Examples: .. code-block:: python @@ -1998,8 +2065,8 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): helper = LayerHelper('pinv', **locals()) dtype = x.dtype check_variable_and_dtype( - x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], - 'pinv') + x, 'dtype', ['float32', 'float64', 'complex64', + 'complex128'], 'pinv') if dtype == paddle.complex128: s_type = 'float64' @@ -2079,40 +2146,40 @@ def solve(x, y, name=None): Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be a vector/matrix or a batch of vectors/matrices, the equation should be: - + .. math:: Out = X^-1 * Y Specifically, - This system of linear equations has one solution if and only if input 'X' is invertible. - + Args: x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - name(str, optional): Name for the operation (optional, default is None). + name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Tensor: The solution of a square system of linear equations with a unique solution for input 'x' and 'y'. + Tensor: The solution of a square system of linear equations with a unique solution for input 'x' and 'y'. Its data type should be the same as that of `x`. - + Examples: .. code-block:: python - + # a square system of linear equations: # 2*X0 + X1 = 9 # X0 + 2*X1 = 8 - + import paddle import numpy as np - + np_x = np.array([[3, 1],[1, 2]]) np_y = np.array([9, 8]) x = paddle.to_tensor(np_x, dtype="float64") y = paddle.to_tensor(np_y, dtype="float64") out = paddle.linalg.solve(x, y) - + print(out) # [2., 3.]) """ diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 4129a1060daf9..5f7588cb2a9a0 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -2173,3 +2173,211 @@ def strided_slice(x, axes, starts, ends, strides, name=None): return paddle.fluid.layers.strided_slice( input=x, axes=axes, starts=starts, ends=ends, strides=strides) + + +def tensordot(x, y, axes=2, name=None): + r""" + This function computes a contraction, which sum the product of elements from two tensors along the given axes. + + Args: + x (Tensor): The left tensor for contraction with data type ``float32`` or ``float64``. + y (Tensor): The right tensor for contraction with the same data type as ``x``. + axes (int|tuple|list|Tensor, optional): The axes to contract for ``x`` and ``y``, defaulted to integer ``2``. + + 1. It could be a non-negative integer ``n``, + in which the function will sum over the last ``n`` axes of ``x`` and the first ``n`` axes of ``y`` in order. + + 2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. 
+ For example, ``axes`` =[0, 1] applies contraction along the first two axes for ``x`` and the first two axes for ``y``. + + 3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. + When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. + When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. + When containing more than two tuple|list|Tensor, only the first two axis sequences will be used while the others will be ignored. + + 4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list + and applied the same rules described above to determine the contraction axes. + Note that the ``axes`` with Tensor type is ONLY available in Dygraph mode. + name(str, optional): The default value is None. Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` . + + Return: + Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. + In general, :math:`output.ndim = x.ndim + y.ndim - 2 \times n_{axes}`, where :math:`n_{axes}` denotes the number of axes to be contracted. + + NOTES: + 1. This function supports tensor broadcast, + the size in the corresponding dimensions of ``x`` and ``y`` should be equal, or applies to the broadcast rules. + 2. This function also supports axes expansion, + when the two given axis sequences for ``x`` and ``y`` are of different lengths, + the shorter sequence will expand the same axes as the longer one at the end. + For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], + the axis sequence for ``x`` is [0, 1, 2, 3], + while the corresponding axis sequences for ``y`` will be expanded from [1, 0] to [1, 0, 2, 3]. + + Examples: + .. code-block:: python + + import paddle + + data_type = 'float64' + + # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product. + # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. + x = paddle.arange(4, dtype=data_type).reshape([2, 2]) + y = paddle.arange(4, dtype=data_type).reshape([2, 2]) + z = paddle.tensordot(x, y, axes=0) + # z = [[[[0., 0.], + # [0., 0.]], + # + # [[0., 1.], + # [2., 3.]]], + # + # + # [[[0., 2.], + # [4., 6.]], + # + # [[0., 3.], + # [6., 9.]]]] + + + # For two 1-d tensor x and y, the case axes=1 is equivalent to inner product. + x = paddle.arange(10, dtype=data_type) + y = paddle.arange(10, dtype=data_type) + z1 = paddle.tensordot(x, y, axes=1) + z2 = paddle.dot(x, y) + # z1 = z2 = [285.] + + + # For two 2-d tensor x and y, the case axes=1 is equivalent to matrix multiplication. + x = paddle.arange(6, dtype=data_type).reshape([2, 3]) + y = paddle.arange(12, dtype=data_type).reshape([3, 4]) + z1 = paddle.tensordot(x, y, axes=1) + z2 = paddle.matmul(x, y) + # z1 = z2 = [[20., 23., 26., 29.], + # [56., 68., 80., 92.]] + + + # When axes is a 1-d int list, x and y will be contracted along the same given axes. + # Note that axes=[1, 2] is equivalent to axes=[[1, 2]], axes=[[1, 2], []], axes=[[1, 2], [1]], and axes=[[1, 2], [1, 2]]. + x = paddle.arange(24, dtype=data_type).reshape([2, 3, 4]) + y = paddle.arange(36, dtype=data_type).reshape([3, 3, 4]) + z = paddle.tensordot(x, y, axes=[1, 2]) + # z = [[506. , 1298., 2090.], + # [1298., 3818., 6338.]] + + + # When axes is a list containing two 1-d int list, the first will be applied to x and the second to y. 
+ x = paddle.arange(60, dtype=data_type).reshape([3, 4, 5]) + y = paddle.arange(24, dtype=data_type).reshape([4, 3, 2]) + z = paddle.tensordot(x, y, axes=([1, 0], [0, 1])) + # z = [[4400., 4730.], + # [4532., 4874.], + # [4664., 5018.], + # [4796., 5162.], + # [4928., 5306.]] + + + # Thanks to the support of axes expansion, axes=[[0, 1, 3, 4], [1, 0, 3, 4]] can be abbreviated as axes= [[0, 1, 3, 4], [1, 0]]. + x = paddle.arange(720, dtype=data_type).reshape([2, 3, 4, 5, 6]) + y = paddle.arange(720, dtype=data_type).reshape([3, 2, 4, 5, 6]) + z = paddle.tensordot(x, y, axes=[[0, 1, 3, 4], [1, 0]]) + # z = [[23217330., 24915630., 26613930., 28312230.], + # [24915630., 26775930., 28636230., 30496530.], + # [26613930., 28636230., 30658530., 32680830.], + # [28312230., 30496530., 32680830., 34865130.]] + """ + op_type = 'tensordot' + input_dtype = ['float32', 'float64'] + + check_variable_and_dtype(x, 'x', input_dtype, op_type) + check_variable_and_dtype(y, 'y', input_dtype, op_type) + check_type(axes, 'axes', (int, tuple, list, Variable), op_type) + + def _var_to_list(var): + if in_dygraph_mode(): + return tolist(var) + raise TypeError( + "The 'axes' with type 'Tensor' in " + op_type + + " is not available in static graph mode, " + "please convert its type to int|Tuple|List, or use dynamic graph mode." + ) + + axes_x = [] + axes_y = [] + if np.issubdtype(type(axes), np.integer): + assert axes >= 0, ( + "The 'axes' in " + op_type + + f" should not be negative, but received axes={axes}.") + axes_x = range(x.ndim - axes, x.ndim) + axes_y = range(axes) + else: + if isinstance(axes, Variable): + axes = _var_to_list(axes) + + if not axes or np.issubdtype(type(axes[0]), np.integer): + axes_x = axes + else: + axes_x = axes[0] + if len(axes) > 1: + axes_y = axes[1] + + if isinstance(axes_x, Variable): + axes_x = _var_to_list(axes_x) + if isinstance(axes_y, Variable): + axes_y = _var_to_list(axes_y) + + axes_x, axes_y = list(axes_x), list(axes_y) + len_axes_x, len_axes_y = len(axes_x), len(axes_y) + if len_axes_x < len_axes_y: + axes_x.extend(axes_y[len_axes_x:]) + elif len_axes_y < len_axes_x: + axes_y.extend(axes_x[len_axes_y:]) + + shape_x, shape_y = list(x.shape), list(y.shape) + need_contracted_dim_x = np.zeros((x.ndim), dtype=bool) + need_contracted_dim_y = np.zeros((y.ndim), dtype=bool) + contraction_size = 1 + for i in range(len(axes_x)): + dim_x, dim_y = axes_x[i], axes_y[i] + sx, sy = shape_x[dim_x], shape_y[dim_y] + if sx == 1: + shape_y[dim_y] = 1 + y = y.sum(dim_y).reshape(shape_y) + elif sy == 1: + shape_x[dim_x] = 1 + x = x.sum(dim_x).reshape(shape_x) + else: + assert sx == sy, "The dimensional size for 'x' and 'y' in " + op_type + f" should match each other, but 'x' has size {sx} in dim {dim_x} while 'y' has size {sy} in dim {dim_y}." 
+ + need_contracted_dim_x[dim_x] = True + need_contracted_dim_y[dim_y] = True + contraction_size *= shape_x[dim_x] + + perm_x = [] + perm_y = [] + shape_out = [] + not_contraction_size_x = 1 + not_contraction_size_y = 1 + for i in range(x.ndim): + if not need_contracted_dim_x[i]: + perm_x.append(i) + shape_out.append(shape_x[i]) + not_contraction_size_x *= shape_x[i] + perm_x.extend(axes_x) + perm_y.extend(axes_y) + for i in range(y.ndim): + if not need_contracted_dim_y[i]: + perm_y.append(i) + shape_out.append(shape_y[i]) + not_contraction_size_y *= shape_y[i] + + if not shape_out: + shape_out = [1] + + x = x.transpose(perm=perm_x).reshape( + [not_contraction_size_x, contraction_size]) + y = y.transpose(perm=perm_y).reshape( + [contraction_size, not_contraction_size_y]) + out = x.matmul(y).reshape(shape_out) + return out diff --git a/python/paddle/tests/test_dlpack.py b/python/paddle/tests/test_dlpack.py index 2880901d1ad16..3a3f748bb991e 100644 --- a/python/paddle/tests/test_dlpack.py +++ b/python/paddle/tests/test_dlpack.py @@ -22,6 +22,7 @@ class TestDLPack(unittest.TestCase): def test_dlpack_dygraph(self): + paddle.disable_static() tensor = paddle.to_tensor(np.array([1, 2, 3, 4]).astype('int')) dlpack = paddle.utils.dlpack.to_dlpack(tensor) out_from_dlpack = paddle.utils.dlpack.from_dlpack(dlpack) @@ -31,6 +32,15 @@ def test_dlpack_dygraph(self): np.array(out_from_dlpack), np.array([1, 2, 3, 4]).astype( 'int'))) + def test_dlpack_tensor_larger_than_2dim(self): + paddle.disable_static() + numpy_data = np.random.randn(4, 5, 6) + t = paddle.to_tensor(numpy_data) + # TODO: There may be a reference count problem of to_dlpack. + dlpack = paddle.utils.dlpack.to_dlpack(t) + out = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertTrue(np.allclose(numpy_data, out.numpy())) + def test_dlpack_static(self): paddle.enable_static() tensor = fluid.create_lod_tensor( @@ -57,6 +67,37 @@ def test_dlpack_static(self): np.array(gout_from_dlpack), np.array([[1], [2], [3], [4]]).astype('int'))) + def test_dlpack_dtype_conversion(self): + paddle.disable_static() + # DLpack does not explicitly support bool data type. + dtypes = [ + "float16", + "float32", + "float64", + "int8", + "int16", + "int32", + "int64", + "uint8", + ] + data = np.ones((2, 3, 4)) + for dtype in dtypes: + x = paddle.to_tensor(data, dtype=dtype) + dlpack = paddle.utils.dlpack.to_dlpack(x) + o = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(x.dtype, o.dtype) + self.assertTrue(np.allclose(x.numpy(), o.numpy())) + + complex_dtypes = ["complex64", "complex128"] + for dtype in complex_dtypes: + x = paddle.to_tensor( + [[1 + 6j, 2 + 5j, 3 + 4j], [4 + 3j, 5 + 2j, 6 + 1j]], + dtype=dtype) + dlpack = paddle.utils.dlpack.to_dlpack(x) + o = paddle.utils.dlpack.from_dlpack(dlpack) + self.assertEqual(x.dtype, o.dtype) + self.assertTrue(np.allclose(x.numpy(), o.numpy())) + class TestRaiseError(unittest.TestCase): def test_from_dlpack_raise_type_error(self): diff --git a/python/paddle/tests/test_ops_roi_align.py b/python/paddle/tests/test_ops_roi_align.py new file mode 100644 index 0000000000000..4a37831a0ccf2 --- /dev/null +++ b/python/paddle/tests/test_ops_roi_align.py @@ -0,0 +1,108 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
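The `tensordot` implementation above reduces every contraction to a permutation, a reshape to 2-D, and a `matmul`. A sketch that checks this equivalence for one contraction where the permutation happens to be the identity; the axes and shapes are illustrative:

.. code-block:: python

    import paddle

    x = paddle.arange(24, dtype='float64').reshape([2, 3, 4])
    y = paddle.arange(36, dtype='float64').reshape([3, 4, 3])

    # Contract x's axes (1, 2) against y's axes (0, 1).
    z = paddle.tensordot(x, y, axes=[[1, 2], [0, 1]])

    # The same result by hand: the contracted axes already trail on x and lead on y,
    # so only the reshape to 2-D and the matmul remain.
    manual = paddle.matmul(x.reshape([2, 12]), y.reshape([12, 3]))
    print(paddle.allclose(z, manual))   # True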
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.vision.ops import roi_align, RoIAlign + + +class TestRoIAlign(unittest.TestCase): + def setUp(self): + self.data = np.random.rand(1, 256, 32, 32).astype('float32') + boxes = np.random.rand(3, 4) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + self.boxes = boxes.astype('float32') + self.boxes_num = np.array([3], dtype=np.int32) + + def roi_align_functional(self, output_size): + if isinstance(output_size, int): + output_shape = (3, 256, output_size, output_size) + else: + output_shape = (3, 256, output_size[0], output_size[1]) + + if paddle.in_dynamic_mode(): + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + align_out = roi_align( + data, boxes, boxes_num=boxes_num, output_size=output_size) + np.testing.assert_equal(align_out.shape, output_shape) + + else: + data = paddle.static.data( + shape=self.data.shape, dtype=self.data.dtype, name='data') + boxes = paddle.static.data( + shape=self.boxes.shape, dtype=self.boxes.dtype, name='boxes') + boxes_num = paddle.static.data( + shape=self.boxes_num.shape, + dtype=self.boxes_num.dtype, + name='boxes_num') + + align_out = roi_align( + data, boxes, boxes_num=boxes_num, output_size=output_size) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + align_out = exe.run(paddle.static.default_main_program(), + feed={ + 'data': self.data, + 'boxes': self.boxes, + 'boxes_num': self.boxes_num + }, + fetch_list=[align_out]) + + np.testing.assert_equal(align_out[0].shape, output_shape) + + def test_roi_align_functional_dynamic(self): + self.roi_align_functional(3) + self.roi_align_functional(output_size=(3, 4)) + + def test_roi_align_functional_static(self): + paddle.enable_static() + self.roi_align_functional(3) + paddle.disable_static() + + def test_RoIAlign(self): + roi_align_c = RoIAlign(output_size=(4, 3)) + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + align_out = roi_align_c(data, boxes, boxes_num) + np.testing.assert_equal(align_out.shape, (3, 256, 4, 3)) + + def test_value(self, ): + data = np.array([i for i in range(1, 17)]).reshape(1, 1, 4, + 4).astype(np.float32) + boxes = np.array( + [[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(np.float32) + boxes_num = np.array([2]).astype(np.int32) + output = np.array([[[[6.]]], [[[9.75]]]], dtype=np.float32) + + data = paddle.to_tensor(data) + boxes = paddle.to_tensor(boxes) + boxes_num = paddle.to_tensor(boxes_num) + + roi_align_c = RoIAlign(output_size=1) + align_out = roi_align_c(data, boxes, boxes_num) + np.testing.assert_almost_equal(align_out.numpy(), output) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_ops_roi_pool.py b/python/paddle/tests/test_ops_roi_pool.py new file mode 100644 index 0000000000000..3c84a55da1ea6 --- /dev/null +++ b/python/paddle/tests/test_ops_roi_pool.py @@ -0,0 +1,109 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
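The expected outputs in `test_value` of the RoIAlign test above can be reproduced by hand, assuming the `aligned=True` shift of -0.5 and the adaptive sampling described in the `roi_align` docstring later in this patch. The sketch below only redoes that arithmetic; it does not call the op:

.. code-block:: python

    # The 4x4 input holds v(row, col) = 4*row + col + 1, a linear ramp, so bilinear
    # sampling is exact and each pooled value is the ramp at the mean sample location.
    def ramp(y, x):
        return 4.0 * y + x + 1.0

    # Box [1, 1, 2, 2]: shifted by -0.5 it becomes a 1x1 bin centred at (1.0, 1.0).
    print(ramp(1.0, 1.0))     # 6.0  -> the first expected output

    # Box [1.5, 1.5, 3, 3]: bin [1.0, 2.5] sampled on a 2x2 grid with mean (1.75, 1.75).
    print(ramp(1.75, 1.75))   # 9.75 -> the second expected output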
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np + +import paddle +from paddle.vision.ops import roi_pool, RoIPool + + +class TestRoIPool(unittest.TestCase): + def setUp(self): + self.data = np.random.rand(1, 256, 32, 32).astype('float32') + boxes = np.random.rand(3, 4) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + self.boxes = boxes.astype('float32') + self.boxes_num = np.array([3], dtype=np.int32) + + def roi_pool_functional(self, output_size): + + if isinstance(output_size, int): + output_shape = (3, 256, output_size, output_size) + else: + output_shape = (3, 256, output_size[0], output_size[1]) + + if paddle.in_dynamic_mode(): + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + pool_out = roi_pool( + data, boxes, boxes_num=boxes_num, output_size=output_size) + np.testing.assert_equal(pool_out.shape, output_shape) + + else: + data = paddle.static.data( + shape=self.data.shape, dtype=self.data.dtype, name='data') + boxes = paddle.static.data( + shape=self.boxes.shape, dtype=self.boxes.dtype, name='boxes') + boxes_num = paddle.static.data( + shape=self.boxes_num.shape, + dtype=self.boxes_num.dtype, + name='boxes_num') + + pool_out = roi_pool( + data, boxes, boxes_num=boxes_num, output_size=output_size) + + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + + pool_out = exe.run(paddle.static.default_main_program(), + feed={ + 'data': self.data, + 'boxes': self.boxes, + 'boxes_num': self.boxes_num + }, + fetch_list=[pool_out]) + + np.testing.assert_equal(pool_out[0].shape, output_shape) + + def test_roi_pool_functional_dynamic(self): + self.roi_pool_functional(3) + self.roi_pool_functional(output_size=(3, 4)) + + def test_roi_pool_functional_static(self): + paddle.enable_static() + self.roi_pool_functional(3) + paddle.disable_static() + + def test_RoIPool(self): + roi_pool_c = RoIPool(output_size=(4, 3)) + data = paddle.to_tensor(self.data) + boxes = paddle.to_tensor(self.boxes) + boxes_num = paddle.to_tensor(self.boxes_num) + + pool_out = roi_pool_c(data, boxes, boxes_num) + np.testing.assert_equal(pool_out.shape, (3, 256, 4, 3)) + + def test_value(self, ): + data = np.array([i for i in range(1, 17)]).reshape(1, 1, 4, + 4).astype(np.float32) + boxes = np.array( + [[1., 1., 2., 2.], [1.5, 1.5, 3., 3.]]).astype(np.float32) + boxes_num = np.array([2]).astype(np.int32) + output = np.array([[[[11.]]], [[[16.]]]], dtype=np.float32) + + data = paddle.to_tensor(data) + boxes = paddle.to_tensor(boxes) + boxes_num = paddle.to_tensor(boxes_num) + + roi_pool_c = RoIPool(output_size=1) + pool_out = roi_pool_c(data, boxes, boxes_num) + np.testing.assert_almost_equal(pool_out.numpy(), output) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/utils/dlpack.py b/python/paddle/utils/dlpack.py index ca2a1ae0e19ec..01611be3ea56f 100644 --- a/python/paddle/utils/dlpack.py +++ b/python/paddle/utils/dlpack.py @@ -28,7 +28,9 @@ def 
to_dlpack(x): Encodes a tensor to DLPack. Args: - x (Tensor): A tensor, and the data type is bool, float32, float64, int32, int64. + x (Tensor): The input tensor, and the data type can be `bool`, `float16`, `float32`, + `float64`, `int8`, `int16`, `int32`, `int64`, `uint8`, `complex64`, + `complex128`. Returns: dltensor, and the data type is PyCapsule. @@ -51,19 +53,9 @@ def to_dlpack(x): "The type of 'x' in to_dlpack must be paddle.Tensor," " but received {}.".format(type(x))) - dtype = convert_dtype(x.dtype) - - if dtype not in ['bool', 'int32', 'int64', 'float32', 'float64']: - raise TypeError( - "the dtype of 'x' in to_dlpack must be any of [bool, int32, int64, " - "float32, float64], but received {}.".format(dtype)) - return x.value().get_tensor()._to_dlpack() check_type(x, 'x', (LoDTensor), 'to_dlpack') - check_dtype(x._dtype(), 'x', - ['bool', 'int32', 'int64', 'float32', 'float64'], 'to_dlpack') - return x._to_dlpack() @@ -75,7 +67,9 @@ def from_dlpack(dlpack): dlpack (PyCapsule): a PyCapsule object with the dltensor. Returns: - out (Tensor): a tensor decoded from DLPack. + out (Tensor): a tensor decoded from DLPack. One thing to be noted, if we get + an input dltensor with data type as `bool`, we return the decoded + tensor as `uint8`. Examples: .. code-block:: python diff --git a/python/paddle/utils/install_check.py b/python/paddle/utils/install_check.py index 69baa4facfa96..efdc6847f0056 100644 --- a/python/paddle/utils/install_check.py +++ b/python/paddle/utils/install_check.py @@ -74,7 +74,22 @@ def _is_cuda_available(): return False -def _run_dygraph_single(use_cuda): +def _is_npu_available(): + """ + Check whether NPU is avaiable. + """ + try: + assert len(paddle.static.npu_places()) > 0 + return True + except Exception as e: + logging.warning( + "You are using NPU version PaddlePaddle, but there is no NPU " + "detected on your machine. Maybe NPU devices is not set properly." + "\n Original Error is {}".format(e)) + return False + + +def _run_dygraph_single(use_cuda, use_npu): """ Testing the simple network in dygraph mode using one CPU/GPU. @@ -84,6 +99,8 @@ def _run_dygraph_single(use_cuda): paddle.disable_static() if use_cuda: paddle.set_device('gpu') + elif use_npu: + paddle.set_device('npu') else: paddle.set_device('cpu') weight_attr = paddle.ParamAttr( @@ -102,7 +119,7 @@ def _run_dygraph_single(use_cuda): opt.step() -def _run_static_single(use_cuda): +def _run_static_single(use_cuda, use_npu): """ Testing the simple network with executor running directly, using one CPU/GPU. @@ -119,8 +136,14 @@ def _run_static_single(use_cuda): param_grads = paddle.static.append_backward( out, parameter_list=[weight.name])[0] - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(train_prog, feed={input.name: _prepare_data(1)}, @@ -128,7 +151,7 @@ def _run_static_single(use_cuda): paddle.disable_static() -def _run_static_parallel(use_cuda, device_list): +def _run_static_parallel(use_cuda, use_npu, device_list): """ Testing the simple network in data parallel mode, using multiple CPU/GPU. 
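The `to_dlpack`/`from_dlpack` changes above widen the accepted dtypes well beyond the old bool/int32/int64/float32/float64 set. A minimal round-trip sketch in dygraph mode; `int16` is just one of the newly covered dtypes:

.. code-block:: python

    import numpy as np
    import paddle

    paddle.disable_static()                         # to_dlpack expects a dygraph Tensor here
    x = paddle.to_tensor(np.arange(6).reshape(2, 3).astype('int16'))

    capsule = paddle.utils.dlpack.to_dlpack(x)      # a PyCapsule wrapping the dltensor
    y = paddle.utils.dlpack.from_dlpack(capsule)

    print(y.dtype)                                  # int16 -- the dtype round-trips
    print(np.array_equal(x.numpy(), y.numpy()))     # True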
@@ -150,8 +173,15 @@ def _run_static_parallel(use_cuda, device_list): train_prog).with_data_parallel( loss_name=loss.name, places=device_list) - exe = paddle.static.Executor( - paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()) + if use_cuda: + place = paddle.CUDAPlace(0) + elif use_npu: + place = paddle.NPUPlace(0) + compiled_prog = train_prog + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) exe.run(startup_prog) exe.run(compiled_prog, feed={input.name: _prepare_data(len(device_list))}, @@ -182,23 +212,31 @@ def run_check(): if paddle.is_compiled_with_cuda(): use_cuda = _is_cuda_available() + use_npu = False + elif paddle.is_compiled_with_npu(): + use_npu = _is_npu_available() + use_cuda = False else: + use_npu = False use_cuda = False if use_cuda: device_str = "GPU" device_list = paddle.static.cuda_places() + elif use_npu: + device_str = "NPU" + device_list = paddle.static.npu_places() else: device_str = "CPU" device_list = paddle.static.cpu_places(device_count=2) device_count = len(device_list) - _run_static_single(use_cuda) - _run_dygraph_single(use_cuda) + _run_static_single(use_cuda, use_npu) + _run_dygraph_single(use_cuda, use_npu) print("PaddlePaddle works well on 1 {}.".format(device_str)) try: - _run_static_parallel(use_cuda, device_list) + _run_static_parallel(use_cuda, use_npu, device_list) print("PaddlePaddle works well on {} {}s.".format(device_count, device_str)) print( diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index d5e73f977b563..965cf8b55e793 100644 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -29,7 +29,13 @@ 'deform_conv2d', 'DeformConv2D', 'read_file', - 'decode_jpeg' + 'decode_jpeg', + 'roi_pool', + 'RoIPool', + 'psroi_pool', + 'PSRoIPool', + 'roi_align', + 'RoIAlign', ] @@ -900,3 +906,394 @@ def decode_jpeg(x, mode='unchanged', name=None): type="decode_jpeg", inputs=inputs, attrs=attrs, outputs={"Out": out}) return out + + +def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): + """ + Position sensitive region of interest pooling (also known as PSROIPooling) is to perform + position-sensitive average pooling on regions of interest specified by input. It performs + on inputs of nonuniform sizes to obtain fixed-size feature maps. + + PSROIPooling is proposed by R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. + + Args: + x (Tensor): Input features with shape (N, C, H, W). The data type can be float32 or float64. + boxes (Tensor): Box coordinates of ROIs (Regions of Interest) to pool over. It should be + a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...], + (x1, y1) is the top left coordinates, and (x2, y2) is the bottom + right coordinates. + boxes_num (Tensor): The number of boxes contained in each picture in the batch. + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + is int32. If int, H and W are both equal to output_size. + spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their + input scale to the scale used when pooling. Default: 1.0 + name(str, optional): The default value is None. + Normally there is no need for user to set this property. + For more information, please refer to :ref:`api_guide_Name` + + Returns: + 4-D Tensor. The pooled ROIs with shape (num_rois, output_channels, pooled_h, pooled_w). + The output_channels equal to C / (pooled_h * pooled_w), where C is the channels of input. + + Examples: + .. 
code-block:: python + + import paddle + x = paddle.uniform([2, 490, 28, 28], dtype='float32') + boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32') + boxes_num = paddle.to_tensor([1, 2], dtype='int32') + pool_out = paddle.vision.ops.psroi_pool(x, boxes, boxes_num, 7, 1.0) + """ + + check_type(output_size, 'output_size', (int, tuple, list), 'psroi_pool') + if isinstance(output_size, int): + output_size = (output_size, output_size) + pooled_height, pooled_width = output_size + assert (len(x.shape) == 4, + "Input features with shape should be (N, C, H, W)") + output_channels = int(x.shape[1] / (pooled_height * pooled_width)) + if in_dygraph_mode(): + return core.ops.psroi_pool(x, boxes, boxes_num, "output_channels", + output_channels, "spatial_scale", + spatial_scale, "pooled_height", + pooled_height, "pooled_width", pooled_width) + + helper = LayerHelper('psroi_pool', **locals()) + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='psroi_pool', + inputs={'X': x, + 'ROIs': boxes}, + outputs={'Out': out}, + attrs={ + 'output_channels': output_channels, + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width + }) + return out + + +class PSRoIPool(Layer): + """ + This interface is used to construct a callable object of the ``PSRoIPool`` class. Please + refer to :ref:`api_paddle_vision_ops_psroi_pool`. + + Args: + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + is int32. If int, H and W are both equal to output_size. + spatial_scale (float): Multiplicative spatial scale factor to translate ROI coords from their + input scale to the scale used when pooling. Default: 1.0. + + Shape: + - x: 4-D Tensor with shape (N, C, H, W). + - boxes: 2-D Tensor with shape (num_rois, 4). + - boxes_num: 1-D Tensor. + - output: 4-D tensor with shape (num_rois, output_channels, pooled_h, pooled_w). + The output_channels equal to C / (pooled_h * pooled_w), where C is the channels of input. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + + psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0) + x = paddle.uniform([2, 490, 28, 28], dtype='float32') + boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32') + boxes_num = paddle.to_tensor([1, 2], dtype='int32') + pool_out = psroi_module(x, boxes, boxes_num) + + """ + + def __init__(self, output_size, spatial_scale=1.0): + super(PSRoIPool, self).__init__() + self.output_size = output_size + self.spatial_scale = spatial_scale + + def forward(self, x, boxes, boxes_num): + return psroi_pool(x, boxes, boxes_num, self.output_size, + self.spatial_scale) + + +def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): + """ + This operator implements the roi_pooling layer. + Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). + The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer + For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn. + + Args: + x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W], + where N is the batch size, C is the input channel, H is Height, W is weight. 
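To make the channel arithmetic in `psroi_pool` above concrete — the output has `C / (pooled_h * pooled_w)` channels — a sketch that mirrors the docstring example and spells out the resulting shape; the sizes are illustrative:

.. code-block:: python

    import paddle

    x = paddle.uniform([2, 490, 28, 28], dtype='float32')   # 490 = 10 * 7 * 7
    boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]],
                             dtype='float32')
    boxes_num = paddle.to_tensor([1, 2], dtype='int32')     # 1 box for image 0, 2 for image 1

    out = paddle.vision.ops.psroi_pool(x, boxes, boxes_num, output_size=7)
    print(out.shape)   # [3, 10, 7, 7]: output_channels = 490 // (7 * 7) = 10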
+ The data type is float32 or float64. + boxes (Tensor): boxes (Regions of Interest) to pool over. + 2D-Tensor with the shape of [num_boxes,4]. + Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, + and (x2, y2) is the bottom right coordinates. + boxes_num (Tensor): the number of RoIs in each image, data type is int32. Default: None + output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. + spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 + name(str, optional): for detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + + Returns: + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + + Examples: + .. code-block:: python + + import paddle + from paddle.vision.ops import roi_pool + + data = paddle.rand([1, 256, 32, 32]) + boxes = paddle.rand([3, 4]) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + boxes_num = paddle.to_tensor([3]).astype('int32') + pool_out = roi_pool(data, boxes, boxes_num=boxes_num, output_size=3) + assert pool_out.shape == [3, 256, 3, 3], '' + """ + + check_type(output_size, 'output_size', (int, tuple), 'roi_pool') + if isinstance(output_size, int): + output_size = (output_size, output_size) + + pooled_height, pooled_width = output_size + if in_dygraph_mode(): + assert boxes_num is not None, "boxes_num should not be None in dygraph mode." + pool_out, argmaxes = core.ops.roi_pool( + x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", + pooled_width, "spatial_scale", spatial_scale) + return pool_out + + else: + check_variable_and_dtype(x, 'x', ['float32'], 'roi_pool') + check_variable_and_dtype(boxes, 'boxes', ['float32'], 'roi_pool') + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + + inputs = { + "X": x, + "ROIs": boxes, + } + if boxes_num is not None: + inputs['RoisNum'] = boxes_num + helper.append_op( + type="roi_pool", + inputs=inputs, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out + + +class RoIPool(Layer): + """ + This interface is used to construct a callable object of the `RoIPool` class. Please + refer to :ref:`api_paddle_vision_ops_roi_pool`. + + Args: + output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. + spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0. + + Returns: + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + + Examples: + .. 
code-block:: python
+
+            import paddle
+            from paddle.vision.ops import RoIPool
+
+            data = paddle.rand([1, 256, 32, 32])
+            boxes = paddle.rand([3, 4])
+            boxes[:, 2] += boxes[:, 0] + 3
+            boxes[:, 3] += boxes[:, 1] + 4
+            boxes_num = paddle.to_tensor([3]).astype('int32')
+            roi_pool = RoIPool(output_size=(4, 3))
+            pool_out = roi_pool(data, boxes, boxes_num)
+            assert pool_out.shape == [3, 256, 4, 3], ''
+    """
+
+    def __init__(self, output_size, spatial_scale=1.0):
+        super(RoIPool, self).__init__()
+        self._output_size = output_size
+        self._spatial_scale = spatial_scale
+
+    def forward(self, x, boxes, boxes_num):
+        return roi_pool(
+            x=x,
+            boxes=boxes,
+            boxes_num=boxes_num,
+            output_size=self._output_size,
+            spatial_scale=self._spatial_scale)
+
+    def extra_repr(self):
+        main_str = 'output_size={_output_size}, spatial_scale={_spatial_scale}'
+        return main_str.format(**self.__dict__)
+
+
+def roi_align(x,
+              boxes,
+              boxes_num,
+              output_size,
+              spatial_scale=1.0,
+              sampling_ratio=-1,
+              aligned=True,
+              name=None):
+    """
+    This operator implements the roi_align layer.
+    The Region of Interest (RoI) Align operator performs bilinear interpolation
+    on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7),
+    as described in Mask R-CNN.
+
+    Each region proposal is divided into equal-sized sections of pooled_width
+    by pooled_height; the sampling locations keep their original floating-point
+    positions instead of being quantized.
+
+    In each RoI bin, the values of the four regularly sampled locations are
+    computed directly through bilinear interpolation. The output is the mean of
+    the four locations, which avoids the misalignment problem.
+
+    Args:
+        x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W],
+            where N is the batch size, C is the input channel, H is Height,
+            W is width. The data type is float32 or float64.
+        boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It
+            should be a 2-D Tensor of shape (num_boxes, 4). The data type is
+            float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is
+            the top left coordinates, and (x2, y2) is the bottom right coordinates.
+        boxes_num (Tensor): The number of boxes contained in each picture in
+            the batch, the data type is int32.
+        output_size (int or Tuple[int, int]): The pooled output size(h, w), data
+            type is int32. If int, h and w are both equal to output_size.
+        spatial_scale (float32): Multiplicative spatial scale factor to translate
+            ROI coords from their input scale to the scale used when pooling.
+            Default: 1.0
+        sampling_ratio (int32): number of sampling points in the interpolation
+            grid used to compute the output value of each pooled output bin.
+            If > 0, then exactly ``sampling_ratio x sampling_ratio`` sampling
+            points per bin are used.
+            If <= 0, then an adaptive number of grid points are used (computed
+            as ``ceil(roi_width / output_width)``, and likewise for height).
+            Default: -1
+        aligned (bool): If False, use the legacy implementation. If True, shift
+            the box coordinates by -0.5 pixel for better alignment with the two
+            neighboring pixel indices. This version is used in Detectron2.
+            Default: True
+        name (str, optional): For detailed information, please refer to
+            :ref:`api_guide_Name`. Usually name does not need to be set and is
+            None by default.
+
+    Returns:
+        Tensor: The output of ROIAlignOp is a 4-D tensor with shape (num_boxes,
+            channels, pooled_h, pooled_w). The data type is float32 or float64.
+
+    Examples:
+        .. 
code-block:: python + + import paddle + from paddle.vision.ops import roi_align + + data = paddle.rand([1, 256, 32, 32]) + boxes = paddle.rand([3, 4]) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + boxes_num = paddle.to_tensor([3]).astype('int32') + align_out = roi_align(data, boxes, boxes_num, output_size=3) + assert align_out.shape == [3, 256, 3, 3] + """ + + check_type(output_size, 'output_size', (int, tuple), 'roi_align') + if isinstance(output_size, int): + output_size = (output_size, output_size) + + pooled_height, pooled_width = output_size + if in_dygraph_mode(): + assert boxes_num is not None, "boxes_num should not be None in dygraph mode." + align_out = core.ops.roi_align( + x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", + pooled_width, "spatial_scale", spatial_scale, "sampling_ratio", + sampling_ratio, "aligned", aligned) + return align_out + + else: + check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'roi_align') + check_variable_and_dtype(boxes, 'boxes', ['float32', 'float64'], + 'roi_align') + helper = LayerHelper('roi_align', **locals()) + dtype = helper.input_dtype() + align_out = helper.create_variable_for_type_inference(dtype) + inputs = { + "X": x, + "ROIs": boxes, + } + if boxes_num is not None: + inputs['RoisNum'] = boxes_num + helper.append_op( + type="roi_align", + inputs=inputs, + outputs={"Out": align_out}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + "sampling_ratio": sampling_ratio, + "aligned": aligned, + }) + return align_out + + +class RoIAlign(Layer): + """ + This interface is used to construct a callable object of the `RoIAlign` class. + Please refer to :ref:`api_paddle_vision_ops_roi_align`. + + Args: + output_size (int or tuple[int, int]): The pooled output size(h, w), + data type is int32. If int, h and w are both equal to output_size. + spatial_scale (float32, optional): Multiplicative spatial scale factor + to translate ROI coords from their input scale to the scale used + when pooling. Default: 1.0 + + Returns: + align_out (Tensor): The output of ROIAlign operator is a 4-D tensor with + shape (num_boxes, channels, pooled_h, pooled_w). + + Examples: + .. 
code-block:: python + + import paddle + from paddle.vision.ops import RoIAlign + + data = paddle.rand([1, 256, 32, 32]) + boxes = paddle.rand([3, 4]) + boxes[:, 2] += boxes[:, 0] + 3 + boxes[:, 3] += boxes[:, 1] + 4 + boxes_num = paddle.to_tensor([3]).astype('int32') + roi_align = RoIAlign(output_size=(4, 3)) + align_out = roi_align(data, boxes, boxes_num) + assert align_out.shape == [3, 256, 4, 3] + """ + + def __init__(self, output_size, spatial_scale=1.0): + super(RoIAlign, self).__init__() + self._output_size = output_size + self._spatial_scale = spatial_scale + + def forward(self, x, boxes, boxes_num, aligned=True): + return roi_align( + x=x, + boxes=boxes, + boxes_num=boxes_num, + output_size=self._output_size, + spatial_scale=self._spatial_scale, + aligned=aligned) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 53b5cb9a722c4..6104b168798c9 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -88,7 +88,7 @@ function run_tools_test() { cd ${CUR_PWD} } -changed_env_var_count=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/paddle | grep 'DEFINE_EXPORTED' | wc -l` +changed_env_var_count=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/paddle | grep 'DEFINE_EXPORTED' | grep -v '@@' | wc -l` if [[ $changed_env_var_count -gt 0 ]]; then echo_line="You must have one RD (lanxianghit (Recommend), phlrain or luotao1) approval for changing the FLAGS, which manages the environment variables.\n" check_approval 1 6836917 47554610 43953930 diff --git a/tools/dockerfile/build_scripts/install_cudnn.sh b/tools/dockerfile/build_scripts/install_cudnn.sh index e90a0789a34bd..0817634fa91af 100644 --- a/tools/dockerfile/build_scripts/install_cudnn.sh +++ b/tools/dockerfile/build_scripts/install_cudnn.sh @@ -37,4 +37,12 @@ elif [[ "$1" == "cudnn811" && "$VERSION" == "10.2" ]]; then cp -r lib64 /usr && cd ../ && \ rm -f cudnn-10.2-linux-x64-v8.1.1.33.tgz && \ rm -rf cuda +elif [[ "$1" == "cudnn821" && "$VERSION" == "11.2" ]]; then + wget -q https://paddle-ci.gz.bcebos.com/cudnn/cudnn-11.3-linux-x64-v8.2.1.32.tgz --no-check-certificate + tar -xzf cudnn-11.3-linux-x64-v8.2.1.32.tgz && \ + cd cuda && \ + cp -r include /usr && \ + cp -r lib64 /usr && cd ../ && \ + rm -f cudnn-11.3-linux-x64-v8.2.1.32.tgz && \ + rm -rf cuda fi diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh index 2e7917448f2e2..9e028625de1c3 100644 --- a/tools/dockerfile/build_scripts/install_trt.sh +++ b/tools/dockerfile/build_scripts/install_trt.sh @@ -31,6 +31,11 @@ if [[ "$VERSION" == "10.1" ]];then tar -zxf TensorRT6-cuda10.1-cudnn7.tar.gz -C /usr/local cp -rf /usr/local/TensorRT6-cuda10.1-cudnn7/include/* /usr/include/ && cp -rf /usr/local/TensorRT6-cuda10.1-cudnn7/lib/* /usr/lib/ rm TensorRT6-cuda10.1-cudnn7.tar.gz +elif [[ "$1" == "trt8034" && "$VERSION" == "11.2" ]];then + wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz --no-check-certificate + tar -zxf TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz -C /usr/local + cp -rf /usr/local/TensorRT-8.0.3.4/include/* /usr/include/ && cp -rf /usr/local/TensorRT-8.0.3.4/lib/* /usr/lib/ + rm TensorRT-8.0.3.4.Linux.x86_64-gnu.cuda-11.3.cudnn8.2.tar.gz elif [[ "$VERSION" == "11.2" ]];then wget -q https://paddle-ci.gz.bcebos.com/TRT/TensorRT7-cuda11.1-cudnn8.1.tar.gz --no-check-certificate tar -zxf TensorRT7-cuda11.1-cudnn8.1.tar.gz -C /usr/local diff --git a/tools/dockerfile/centos7_manylinux.sh 
b/tools/dockerfile/centos7_manylinux.sh index 2435c57d541b0..6038e464097cd 100755 --- a/tools/dockerfile/centos7_manylinux.sh +++ b/tools/dockerfile/centos7_manylinux.sh @@ -84,6 +84,22 @@ function make_cuda112cudnn8() { sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp } +function make_cuda112cudnn821trt8034gcc82() { + sed 's//11.2.2-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc82 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH \nRUN yum remove -y libcudnn8-devel.x86_64 libcudnn8.x86_64 \nRun bash build_scripts/install_cudnn.sh cudnn821 \nENV CUDNN_VERSION=8.2.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "/install_trt.sh/d" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_trt.sh trt8034 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp +} + +function make_cuda112cudnn821trt8034gcc54() { + sed 's//11.2.2-cudnn8-devel-centos7/g' Dockerfile.centos >Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_gcc.sh gcc54 \nRUN mv /usr/bin/cc /usr/bin/cc.bak \&\& ln -s /usr/local/gcc-5.4/bin/gcc /usr/bin/cc \nENV PATH=/usr/local/gcc-5.4/bin:\$PATH \nRUN yum remove -y libcudnn8-devel.x86_64 libcudnn8.x86_64 \nRun bash build_scripts/install_cudnn.sh cudnn821 \nENV CUDNN_VERSION=8.2.1 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i "/install_trt.sh/d" Dockerfile.tmp + sed -i "s#RUN bash build_scripts/build.sh#RUN bash build_scripts/install_trt.sh trt8034 \nRUN bash build_scripts/build.sh#g" Dockerfile.tmp + sed -i '/CMD/iRUN ldconfig' Dockerfile.tmp +} + function main() { local CMD=$1 case $CMD in @@ -123,6 +139,12 @@ function main() { cuda112cudnn8) make_cuda112cudnn8 ;; + cuda112cudnn821trt8034gcc82) + make_cuda112cudnn821trt8034gcc82 + ;; + cuda112cudnn821trt8034gcc54) + make_cuda112cudnn821trt8034gcc54 + ;; *) echo "Make dockerfile error, Without this paramet." exit 1 diff --git a/tools/externalError/README.md b/tools/externalError/README.md index 029efd8cb9491..0c2ac626991da 100644 --- a/tools/externalError/README.md +++ b/tools/externalError/README.md @@ -1,9 +1,25 @@ -Usage: +#### **Introduction for crawling new error message:** -Please run: -``` -bash start.sh -``` -If you want to update all external error message, you need to run command `bash start.sh` in current directory, -and upload the generated file `externalErrorMsg.tar.gz` to https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz + +1. add new spider code in spider.py for crawling error message from website. + +2. run `bash start.sh` in current directory to generate new externalErrorMsg_${date}.tar.gz file, for example `externalErrorMsg_20210928.tar.gz`. + +3. upload above tar file into bos https://paddlepaddledeps.bj.bcebos.com **paddlepaddledeps** bucket, and copy download link `${download_url}`. ***\*Be careful not to delete original tar file\****. + +4. 
compute md5 value of above tar file `${md5}`, and modify cmake/third_party.cmake file + + ``` + set(URL "${download_url}" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 ${md5}) + ``` + + for example: + + ``` + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa) + ``` + +5. commit your changes, and create pull request. diff --git a/tools/externalError/spider.py b/tools/externalError/spider.py index a74d82f40ebeb..e07f05f561cb5 100644 --- a/tools/externalError/spider.py +++ b/tools/externalError/spider.py @@ -17,8 +17,10 @@ import urllib.request import json import collections -import sys, getopt +import sys +import getopt import external_error_pb2 +from html.parser import HTMLParser def parsing(externalErrorDesc): @@ -335,6 +337,31 @@ def parsing(externalErrorDesc): _Messages.message = "'%s'. %s" % (error[0], m_message) print("End crawling errorMessage for nvidia NCCL API!\n") + #*************************************************************************************************# + #*********************************** CUFFT Error Message **************************************# + print("start crawling errorMessage for nvidia CUFFT API--->") + url = 'https://docs.nvidia.com/cuda/cufft/index.html#cufftresult' + + allMessageDesc = externalErrorDesc.errors.add() + allMessageDesc.type = external_error_pb2.CUFFT + + html = urllib.request.urlopen(url).read().decode('utf-8') + + class CUFFTHTMLParser(HTMLParser): + '''CUFFTHTML Parser + ''' + + def handle_data(self, data): + if 'typedef enum cufftResult_t' in data: + for line in data.strip().splitlines()[1:-1]: + status, code, desc = re.split('=|//', line.strip()) + _Messages = allMessageDesc.messages.add() + _Messages.code = int(code.strip(' ,')) + _Messages.message = "'%s'. %s" % (status.strip(), + desc.strip()) + + CUFFTHTMLParser().feed(html) + def main(argv): try: diff --git a/tools/externalError/start.sh b/tools/externalError/start.sh index 32ef63c261268..82715dd47326c 100644 --- a/tools/externalError/start.sh +++ b/tools/externalError/start.sh @@ -32,4 +32,4 @@ fi protobuf/bin/protoc -I../../paddle/fluid/platform/ --python_out . ../../paddle/fluid/platform/external_error.proto python3.7 spider.py -tar czvf externalErrorMsg.tar.gz externalErrorMsg.pb +tar czvf externalErrorMsg_$(date +'%Y%m%d').tar.gz externalErrorMsg.pb diff --git a/tools/get_pr_ut.py b/tools/get_pr_ut.py index bd67d68c13111..0ba6026535307 100644 --- a/tools/get_pr_ut.py +++ b/tools/get_pr_ut.py @@ -139,6 +139,7 @@ def get_is_white_file(self, filename): """ judge is white file in pr's files. 
""" isWhiteFile = False not_white_files = (PADDLE_ROOT + 'cmake/', PADDLE_ROOT + 'patches/', + PADDLE_ROOT + 'paddle/testing/', PADDLE_ROOT + 'tools/dockerfile/', PADDLE_ROOT + 'tools/windows/', PADDLE_ROOT + 'tools/test_runner.py', From 1d1bd8247a8116d10e2fa305c865d491bee7d9d6 Mon Sep 17 00:00:00 2001 From: liutiexing Date: Tue, 12 Oct 2021 09:41:49 +0000 Subject: [PATCH 4/7] merge --- .../framework/new_executor/workqueue_utils.h | 39 ------------------- 1 file changed, 39 deletions(-) diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index 1aab4529b023c..bb219fea36267 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -18,12 +18,6 @@ #include #include #include -#if defined(_M_X64) || defined(__x86_64__) || defined(_M_IX86) || \ - defined(__i386__) -#define __WQ_x86__ -#include -#endif -#include #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -70,38 +64,5 @@ void* AlignedMalloc(size_t size, size_t alignment); void AlignedFree(void* memory_ptr); -static inline void CpuRelax() { -#if defined(__WQ_x86__) - _mm_pause(); -#endif -} - -class SpinLock { - public: - void lock() { - for (;;) { - if (!lock_.exchange(true, std::memory_order_acquire)) { - break; - } - constexpr int kMaxLoop = 32; - for (int loop = 1; lock_.load(std::memory_order_relaxed);) { - if (loop <= kMaxLoop) { - for (int i = 1; i <= loop; ++i) { - CpuRelax(); - } - loop *= 2; - } else { - std::this_thread::yield(); - } - } - } - } - - void unlock() { lock_.store(false, std::memory_order_release); } - - private: - std::atomic lock_{false}; -}; - } // namespace framework } // namespace paddle From e206173aa9be7401b83a53581627bfaf557c8fb2 Mon Sep 17 00:00:00 2001 From: liutiexing Date: Fri, 15 Oct 2021 16:29:37 +0000 Subject: [PATCH 5/7] Add EventsWaiter --- .../framework/new_executor/event_count.h | 4 +- .../framework/new_executor/interpretercore.cc | 5 +- .../framework/new_executor/interpretercore.h | 1 + .../new_executor/interpretercore_util.h | 12 ++- .../new_executor/nonblocking_threadpool.h | 30 +++----- .../fluid/framework/new_executor/workqueue.cc | 46 +++++------- .../fluid/framework/new_executor/workqueue.h | 24 ++++-- .../framework/new_executor/workqueue_test.cc | 23 ++++-- .../framework/new_executor/workqueue_utils.cc | 74 +++++++++++++++++++ .../framework/new_executor/workqueue_utils.h | 50 +++++++++++++ 10 files changed, 199 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/framework/new_executor/event_count.h b/paddle/fluid/framework/new_executor/event_count.h index 0c6d49042d22d..7f1e3670056fc 100644 --- a/paddle/fluid/framework/new_executor/event_count.h +++ b/paddle/fluid/framework/new_executor/event_count.h @@ -50,11 +50,13 @@ #include #include #include -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" namespace paddle { namespace framework { +void* AlignedMalloc(size_t size, size_t alignment); +void AlignedFree(void* memory_ptr); + class EventCount { public: class Waiter; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 083d989cb5267..c8acf8cc5dbb8 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -37,7 +37,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place, main_program_(main_prog), global_scope_(global_scope), stream_analyzer_(place), - 
async_work_queue_(kHostNumThreads) { + async_work_queue_(kHostNumThreads, main_thread_blocker_) { is_build_ = false; feed_names_ = feed_names; @@ -365,7 +365,8 @@ void InterpreterCore::ExecuteInstructionList( } } - async_work_queue_.WaitEmpty(); + auto event_id = main_thread_blocker_.WaitEvent(); + VLOG(3) << "event_id " << event_id; PADDLE_ENFORCE_EQ( op_run_number_.load(), vec_instr.size(), diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 47f23aff4f00e..eac3131ca1b31 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -94,6 +94,7 @@ class InterpreterCore { InterpreterProfiler dry_run_profiler_; StreamAnalyzer stream_analyzer_; EventManager event_manager_; + EventsWaiter main_thread_blocker_; interpretercore::AsyncWorkQueue async_work_queue_; InterpreterCoreGarbageCollector gc_; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 2a5942c712365..3c927a8d81d16 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -33,6 +33,7 @@ #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/framework/new_executor/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -53,16 +54,19 @@ using AtomicVectorSizeT = std::vector>>; class AsyncWorkQueue { public: - explicit AsyncWorkQueue(size_t host_num_threads) + AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter) : host_num_thread_(host_num_threads) { std::vector group_options; // for execute host Kernel group_options.emplace_back(/*num_threads*/ host_num_threads, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, + /*queue_empty_waiter*/ waiter); // for launch device Kernel group_options.emplace_back(/*num_threads*/ 1, - /*allow_spinning*/ true, /*track_task*/ true); + /*allow_spinning*/ true, + /*track_task*/ true, + /*queue_empty_waiter*/ waiter); queue_group_ = CreateWorkQueueGroup(group_options); } @@ -71,7 +75,7 @@ class AsyncWorkQueue { AtomicVectorSizeT& PrepareAtomicVarRef( const std::vector& vec_meta_info); - void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } + // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } void AddTask(const OpFuncType& op_func_type, std::function fn) { queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); diff --git a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h index 2997ce1fe2473..667723c67165c 100644 --- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h @@ -19,9 +19,12 @@ namespace paddle { namespace framework { +template class TaskTracker { public: - TaskTracker() : wait_empty_cv_(1) {} + TaskTracker() = default; + + explicit TaskTracker(Notifier& notifier) : notifier_(¬ifier) {} TaskTracker(const TaskTracker&) = delete; @@ -33,32 +36,17 @@ class TaskTracker { void SubCounter() { if (1 == num_tasks_.fetch_sub(1, std::memory_order_relaxed)) { - wait_empty_cv_.Notify(true); + if (notifier_ != nullptr) { + notifier_->NotifyEvent(); + } 
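+      // The bound Notifier (if any) marks this tracker's event as triggered in
+      // its EventsWaiter and wakes a thread blocked in EventsWaiter::WaitEvent().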
} } - // only one user can wait at any time - void WaitTaskNumToZero() { - bool waiting = false; - if (!wait_empty_.compare_exchange_strong(waiting, true, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { - abort(); - } - EventCount::Waiter* w = wait_empty_cv_.GetWaiter(0); - wait_empty_cv_.Prewait(); - if (num_tasks_.load(std::memory_order_relaxed) == 0) { - wait_empty_cv_.CancelWait(); - } else { - wait_empty_cv_.CommitWait(w); - } - wait_empty_.store(false); - } + uint64_t PendingTaskNum() { return num_tasks_.load(); } private: alignas(64) std::atomic num_tasks_{0}; - alignas(64) EventCount wait_empty_cv_; - alignas(64) std::atomic wait_empty_{false}; + Notifier* notifier_{nullptr}; }; template diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index 8c6eeab4d5c0a..deeed1eb72af2 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -13,13 +13,18 @@ namespace paddle { namespace framework { namespace { +using TaskTracker = TaskTracker; + class WorkQueueImpl : public WorkQueue { public: - explicit WorkQueueImpl(const WorkQueueOptions& options) - : WorkQueue(options), queue_(nullptr), tracker_(nullptr) { - if (options_.track_task) { + explicit WorkQueueImpl(const WorkQueueOptions& options) : WorkQueue(options) { + if (options_.track_task && options.queue_empty_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - tracker_ = new (storage) TaskTracker; + TaskTracker* tracker = reinterpret_cast(storage); + auto event_id = options.queue_empty_waiter->RegisterEvent( + [tracker]() { return tracker->PendingTaskNum() == 0; }); + auto notifier = options.queue_empty_waiter->GetEventNotifier(event_id); + tracker_ = new (storage) TaskTracker(notifier.get()); } queue_ = new NonblockingThreadPool(options_.num_threads, options_.allow_spinning); @@ -44,20 +49,11 @@ class WorkQueueImpl : public WorkQueue { queue_->AddTask(std::move(fn)); } - void WaitQueueEmpty() override { - if (tracker_ == nullptr) { - PADDLE_THROW( - platform::errors::Unavailable("set WorkQueueOptions.track_task = " - "true before call this interface.")); - } - tracker_->WaitTaskNumToZero(); - } - size_t NumThreads() const override { return queue_->NumThreads(); } private: - NonblockingThreadPool* queue_; - TaskTracker* tracker_; + NonblockingThreadPool* queue_{nullptr}; + TaskTracker* tracker_{nullptr}; }; class WorkQueueGroupImpl : public WorkQueueGroup { @@ -69,8 +65,6 @@ class WorkQueueGroupImpl : public WorkQueueGroup { void AddTask(size_t queue_idx, std::function fn) override; - void WaitQueueGroupEmpty() override; - size_t QueueNumThreads(size_t queue_idx) const override; size_t QueueGroupNumThreads() const override; @@ -92,9 +86,14 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( queues_storage_ = reinterpret_cast(buffer); for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; - if (options.track_task && tracker_ == nullptr) { + if (options.track_task && tracker_ == nullptr && + options.queue_empty_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - tracker_ = new (storage) TaskTracker; + TaskTracker* tracker = reinterpret_cast(storage); + auto event_id = options.queue_empty_waiter->RegisterEvent( + [tracker]() { return tracker->PendingTaskNum() == 0; }); + auto notifier = options.queue_empty_waiter->GetEventNotifier(event_id); + tracker_ = new (storage) 
TaskTracker(notifier.get()); } queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool(options.num_threads, options.allow_spinning); @@ -124,15 +123,6 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { queues_[queue_idx]->AddTask(std::move(fn)); } -void WorkQueueGroupImpl::WaitQueueGroupEmpty() { - if (nullptr == tracker_) { - PADDLE_THROW(platform::errors::Unavailable( - "set WorkQueueOptions.track_task = true for at least one of queues " - "before call this interface.")); - } - tracker_->WaitTaskNumToZero(); -} - size_t WorkQueueGroupImpl::QueueNumThreads(size_t queue_idx) const { assert(queue_idx < queues_.size()); return queues_.at(queue_idx)->NumThreads(); diff --git a/paddle/fluid/framework/new_executor/workqueue.h b/paddle/fluid/framework/new_executor/workqueue.h index ead9d9949b700..e184566f914c3 100644 --- a/paddle/fluid/framework/new_executor/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue.h @@ -21,15 +21,29 @@ namespace paddle { namespace framework { +class EventsWaiter; + struct WorkQueueOptions { WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task) : num_threads(num_threads), allow_spinning(allow_spinning), track_task(track_task) {} + WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task, + EventsWaiter* waiter) + : num_threads(num_threads), + allow_spinning(allow_spinning), + track_task(track_task), + queue_empty_waiter(waiter) {} + size_t num_threads; bool allow_spinning; + // If you need to blocking the calling thread to wait "queue empty", set + // track_task = true and set queue_empty_waiter. EventsWaiter::WaitEvent will + // block the calling thread until any of events (including "queue empty") + // occured. bool track_task; + EventsWaiter* queue_empty_waiter{nullptr}; }; class WorkQueue { @@ -44,9 +58,8 @@ class WorkQueue { virtual void AddTask(std::function fn) = 0; - // set WorkQueueOptions.track_task = true before call this - // interface, otherwise will abort() - virtual void WaitQueueEmpty() = 0; + // See WorkQueueOptions.track_task for details + // virtual void WaitQueueEmpty() = 0; virtual size_t NumThreads() const = 0; @@ -67,9 +80,8 @@ class WorkQueueGroup { virtual void AddTask(size_t queue_idx, std::function fn) = 0; - // set WorkQueueOptions.track_task = true for at least one of queues - // before call this interface, otherwise will abort() - virtual void WaitQueueGroupEmpty() = 0; + // See WorkQueueOptions.track_task for details + // virtual void WaitQueueGroupEmpty() = 0; virtual size_t QueueNumThreads(size_t queue_idx) const = 0; diff --git a/paddle/fluid/framework/new_executor/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue_test.cc index c229a84b145ab..51a0b58a3865f 100644 --- a/paddle/fluid/framework/new_executor/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue_test.cc @@ -16,18 +16,21 @@ #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" TEST(WorkQueue, TestSingleThreadedWorkQueue) { VLOG(1) << "In Test"; using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateSingleThreadedWorkQueue; + using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kLoopNum = 1000000; // CreateSingleThreadedWorkQueue + EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, 
events_waiter); auto work_queue = CreateSingleThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 1u); @@ -42,7 +45,7 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { }); // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - work_queue->WaitQueueEmpty(); + events_waiter.WaitEvent(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum); } @@ -52,13 +55,15 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateMultiThreadedWorkQueue; + using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; // CreateMultiThreadedWorkQueue + EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, events_waiter); auto work_queue = CreateMultiThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 10u); @@ -75,7 +80,7 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { } // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - work_queue->WaitQueueEmpty(); + events_waiter.WaitEvent(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum); } @@ -84,15 +89,17 @@ TEST(WorkQueue, TestWorkQueueGroup) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueueGroup; using paddle::framework::CreateWorkQueueGroup; + using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; - // CreateMultiThreadedWorkQueue + // ThreadedWorkQueueGroup + EventsWaiter events_waiter; WorkQueueOptions sq_options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, events_waiter); WorkQueueOptions mq_options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, events_waiter); auto queue_group = CreateWorkQueueGroup({sq_options, mq_options}); // NumThreads EXPECT_EQ(queue_group->QueueNumThreads(0), 1u); @@ -113,6 +120,6 @@ TEST(WorkQueue, TestWorkQueueGroup) { } }); // WaitQueueGroupEmpty() - queue_group->WaitQueueGroupEmpty(); + events_waiter.WaitEvent(); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum + kLoopNum); } diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue_utils.cc index 2ea49e676a807..9303588152c31 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.cc +++ b/paddle/fluid/framework/new_executor/workqueue_utils.cc @@ -55,5 +55,79 @@ void AlignedFree(void* mem_ptr) { #endif } +constexpr EventsWaiter::EventId kEmptyEventId = -1; + +EventsWaiter::EventsWaiter() + : trigger_event_(kEmptyEventId), waiting_(false), cv_(1) {} + +EventsWaiter::EventId EventsWaiter::RegisterEvent(EventChecker checker) { + checkers_.push_back(std::move(checker)); + EventId id = checkers_.size() - 1; + Notifier n(id, *this); + notifiers_.push_back(n); + return id; +} + +std::reference_wrapper EventsWaiter::GetEventNotifier( + const EventId& id) { + int64_t event_num = checkers_.size(); + if (id < 0 || id > event_num) { + PADDLE_THROW(platform::errors::OutOfRange("Invalid EventId")); + } + return notifiers_[id]; +} + +EventsWaiter::EventId EventsWaiter::WaitEvent() { + // only one user can wait at any time + bool waiting = false; + if 
(!waiting_.compare_exchange_strong(waiting, true, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + PADDLE_THROW( + platform::errors::ResourceExhausted("Another thread is waiting.")); + } + EventId id = trigger_event_.load(std::memory_order_acquire); + auto w = cv_.GetWaiter(0); + cv_.Prewait(); + int64_t event_num = checkers_.size(); + for (int64_t i = 0; id == kEmptyEventId && i < event_num; ++i) { + if (checkers_[i]()) { + id = i; + } + } + if (id != kEmptyEventId) { + cv_.CancelWait(); + } else { + cv_.CommitWait(w); + } + trigger_event_.store(kEmptyEventId); + waiting_.store(false); + return id; +} + +const std::set EventsWaiter::WaitEvents() { + std::set ids; + auto trigger_ev = WaitEvent(); + ids.insert(trigger_ev); + for (int64_t i = 0; i < static_cast(checkers_.size()); ++i) { + if (checkers_[i]()) { + ids.insert(i); + } + } + return ids; +} + +void EventsWaiter::SetTriggerEvent(const EventId& id) { + EventId expected_id = kEmptyEventId; + if (!trigger_event_.compare_exchange_strong(expected_id, id, + std::memory_order_seq_cst, + std::memory_order_release)) { + return; + } + cv_.Notify(true); +} + +void EventsWaiter::Notifier::NotifyEvent() { waiter_.SetTriggerEvent(id_); } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index bb219fea36267..d04bf3bfe3c0c 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -18,6 +18,9 @@ #include #include #include +#include +#include +#include "paddle/fluid/framework/new_executor/event_count.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -64,5 +67,52 @@ void* AlignedMalloc(size_t size, size_t alignment); void AlignedFree(void* memory_ptr); +// A multiplexing waiter, be able to wait multi events simultaneously +// Blocking the calling thread to wait any of the registered events +class EventsWaiter { + public: + using EventId = int64_t; + + using EventChecker = std::function; + + class Notifier { + public: + void NotifyEvent(); + + private: + friend EventsWaiter; + Notifier(EventId id, EventsWaiter& waiter) : id_(id), waiter_(waiter) {} + + EventId id_; + EventsWaiter& waiter_; + }; + + EventsWaiter(); + + EventsWaiter(const EventsWaiter&) = delete; + + EventsWaiter& operator=(const EventsWaiter&) = delete; + + // RegisterEvent must be called before WaitEvent + EventId RegisterEvent(EventChecker checker); + + std::reference_wrapper GetEventNotifier(const EventId& id); + + // Wait any of the registered events + EventId WaitEvent(); + + const std::set WaitEvents(); + + private: + friend Notifier; + void SetTriggerEvent(const EventId& id); + + std::vector checkers_; + std::vector notifiers_; + std::atomic trigger_event_; + std::atomic waiting_; + EventCount cv_; +}; + } // namespace framework } // namespace paddle From 0a3dcd9d0377495d18a440e730318b63f8ec57d1 Mon Sep 17 00:00:00 2001 From: liutiexing Date: Fri, 15 Oct 2021 17:08:27 +0000 Subject: [PATCH 6/7] Revert "Add EventsWaiter" This reverts commit e206173aa9be7401b83a53581627bfaf557c8fb2. 
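For context on what this revert removes, below is a minimal, illustrative sketch of how the EventsWaiter added in the previous patch is meant to be driven. The EventsWaiter/Notifier names, the RegisterEvent/GetEventNotifier/NotifyEvent/WaitEvent calls and the header path are taken from that patch; the main() scaffolding, the atomic flag and the worker thread are assumptions made only for the example.

```
// Sketch only: one background thread signals an event, the main thread waits.
#include <atomic>
#include <thread>
#include "paddle/fluid/framework/new_executor/workqueue_utils.h"

int main() {
  using paddle::framework::EventsWaiter;
  std::atomic<bool> done{false};

  EventsWaiter waiter;
  // RegisterEvent must be called before WaitEvent; the checker lets the waiter
  // confirm that the event it was woken for has really occurred.
  auto event_id = waiter.RegisterEvent([&done] { return done.load(); });
  auto notifier = waiter.GetEventNotifier(event_id);

  std::thread worker([&done, notifier] {
    done.store(true);
    notifier.get().NotifyEvent();  // wakes the thread blocked in WaitEvent()
  });

  auto triggered = waiter.WaitEvent();  // blocks until a registered event fires
  worker.join();
  return triggered == event_id ? 0 : 1;
}
```

This is the handshake that WorkQueueImpl and WorkQueueGroupImpl wire up for their TaskTracker in the commit being reverted; after the revert, callers go back to WaitQueueEmpty()/WaitQueueGroupEmpty().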
--- .../framework/new_executor/event_count.h | 4 +- .../framework/new_executor/interpretercore.cc | 5 +- .../framework/new_executor/interpretercore.h | 1 - .../new_executor/interpretercore_util.h | 12 +-- .../new_executor/nonblocking_threadpool.h | 30 +++++--- .../fluid/framework/new_executor/workqueue.cc | 46 +++++++----- .../fluid/framework/new_executor/workqueue.h | 24 ++---- .../framework/new_executor/workqueue_test.cc | 23 ++---- .../framework/new_executor/workqueue_utils.cc | 74 ------------------- .../framework/new_executor/workqueue_utils.h | 50 ------------- 10 files changed, 70 insertions(+), 199 deletions(-) diff --git a/paddle/fluid/framework/new_executor/event_count.h b/paddle/fluid/framework/new_executor/event_count.h index 7f1e3670056fc..0c6d49042d22d 100644 --- a/paddle/fluid/framework/new_executor/event_count.h +++ b/paddle/fluid/framework/new_executor/event_count.h @@ -50,13 +50,11 @@ #include #include #include +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" namespace paddle { namespace framework { -void* AlignedMalloc(size_t size, size_t alignment); -void AlignedFree(void* memory_ptr); - class EventCount { public: class Waiter; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index c8acf8cc5dbb8..083d989cb5267 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -37,7 +37,7 @@ InterpreterCore::InterpreterCore(const platform::Place& place, main_program_(main_prog), global_scope_(global_scope), stream_analyzer_(place), - async_work_queue_(kHostNumThreads, main_thread_blocker_) { + async_work_queue_(kHostNumThreads) { is_build_ = false; feed_names_ = feed_names; @@ -365,8 +365,7 @@ void InterpreterCore::ExecuteInstructionList( } } - auto event_id = main_thread_blocker_.WaitEvent(); - VLOG(3) << "event_id " << event_id; + async_work_queue_.WaitEmpty(); PADDLE_ENFORCE_EQ( op_run_number_.load(), vec_instr.size(), diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index eac3131ca1b31..47f23aff4f00e 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -94,7 +94,6 @@ class InterpreterCore { InterpreterProfiler dry_run_profiler_; StreamAnalyzer stream_analyzer_; EventManager event_manager_; - EventsWaiter main_thread_blocker_; interpretercore::AsyncWorkQueue async_work_queue_; InterpreterCoreGarbageCollector gc_; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 3c927a8d81d16..2a5942c712365 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -33,7 +33,6 @@ #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/framework/new_executor/workqueue.h" -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -54,19 +53,16 @@ using AtomicVectorSizeT = std::vector>>; class AsyncWorkQueue { public: - AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter) + explicit AsyncWorkQueue(size_t host_num_threads) : host_num_thread_(host_num_threads) { std::vector 
group_options; // for execute host Kernel group_options.emplace_back(/*num_threads*/ host_num_threads, /*allow_spinning*/ true, - /*track_task*/ true, - /*queue_empty_waiter*/ waiter); + /*track_task*/ true); // for launch device Kernel group_options.emplace_back(/*num_threads*/ 1, - /*allow_spinning*/ true, - /*track_task*/ true, - /*queue_empty_waiter*/ waiter); + /*allow_spinning*/ true, /*track_task*/ true); queue_group_ = CreateWorkQueueGroup(group_options); } @@ -75,7 +71,7 @@ class AsyncWorkQueue { AtomicVectorSizeT& PrepareAtomicVarRef( const std::vector& vec_meta_info); - // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } + void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } void AddTask(const OpFuncType& op_func_type, std::function fn) { queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); diff --git a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h index 667723c67165c..2997ce1fe2473 100644 --- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h @@ -19,12 +19,9 @@ namespace paddle { namespace framework { -template class TaskTracker { public: - TaskTracker() = default; - - explicit TaskTracker(Notifier& notifier) : notifier_(¬ifier) {} + TaskTracker() : wait_empty_cv_(1) {} TaskTracker(const TaskTracker&) = delete; @@ -36,17 +33,32 @@ class TaskTracker { void SubCounter() { if (1 == num_tasks_.fetch_sub(1, std::memory_order_relaxed)) { - if (notifier_ != nullptr) { - notifier_->NotifyEvent(); - } + wait_empty_cv_.Notify(true); } } - uint64_t PendingTaskNum() { return num_tasks_.load(); } + // only one user can wait at any time + void WaitTaskNumToZero() { + bool waiting = false; + if (!wait_empty_.compare_exchange_strong(waiting, true, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + abort(); + } + EventCount::Waiter* w = wait_empty_cv_.GetWaiter(0); + wait_empty_cv_.Prewait(); + if (num_tasks_.load(std::memory_order_relaxed) == 0) { + wait_empty_cv_.CancelWait(); + } else { + wait_empty_cv_.CommitWait(w); + } + wait_empty_.store(false); + } private: alignas(64) std::atomic num_tasks_{0}; - Notifier* notifier_{nullptr}; + alignas(64) EventCount wait_empty_cv_; + alignas(64) std::atomic wait_empty_{false}; }; template diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index deeed1eb72af2..8c6eeab4d5c0a 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -13,18 +13,13 @@ namespace paddle { namespace framework { namespace { -using TaskTracker = TaskTracker; - class WorkQueueImpl : public WorkQueue { public: - explicit WorkQueueImpl(const WorkQueueOptions& options) : WorkQueue(options) { - if (options_.track_task && options.queue_empty_waiter != nullptr) { + explicit WorkQueueImpl(const WorkQueueOptions& options) + : WorkQueue(options), queue_(nullptr), tracker_(nullptr) { + if (options_.track_task) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - TaskTracker* tracker = reinterpret_cast(storage); - auto event_id = options.queue_empty_waiter->RegisterEvent( - [tracker]() { return tracker->PendingTaskNum() == 0; }); - auto notifier = options.queue_empty_waiter->GetEventNotifier(event_id); - tracker_ = new (storage) TaskTracker(notifier.get()); + tracker_ = new (storage) TaskTracker; } queue_ = new 
NonblockingThreadPool(options_.num_threads, options_.allow_spinning); @@ -49,11 +44,20 @@ class WorkQueueImpl : public WorkQueue { queue_->AddTask(std::move(fn)); } + void WaitQueueEmpty() override { + if (tracker_ == nullptr) { + PADDLE_THROW( + platform::errors::Unavailable("set WorkQueueOptions.track_task = " + "true before call this interface.")); + } + tracker_->WaitTaskNumToZero(); + } + size_t NumThreads() const override { return queue_->NumThreads(); } private: - NonblockingThreadPool* queue_{nullptr}; - TaskTracker* tracker_{nullptr}; + NonblockingThreadPool* queue_; + TaskTracker* tracker_; }; class WorkQueueGroupImpl : public WorkQueueGroup { @@ -65,6 +69,8 @@ class WorkQueueGroupImpl : public WorkQueueGroup { void AddTask(size_t queue_idx, std::function fn) override; + void WaitQueueGroupEmpty() override; + size_t QueueNumThreads(size_t queue_idx) const override; size_t QueueGroupNumThreads() const override; @@ -86,14 +92,9 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( queues_storage_ = reinterpret_cast(buffer); for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; - if (options.track_task && tracker_ == nullptr && - options.queue_empty_waiter != nullptr) { + if (options.track_task && tracker_ == nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - TaskTracker* tracker = reinterpret_cast(storage); - auto event_id = options.queue_empty_waiter->RegisterEvent( - [tracker]() { return tracker->PendingTaskNum() == 0; }); - auto notifier = options.queue_empty_waiter->GetEventNotifier(event_id); - tracker_ = new (storage) TaskTracker(notifier.get()); + tracker_ = new (storage) TaskTracker; } queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool(options.num_threads, options.allow_spinning); @@ -123,6 +124,15 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { queues_[queue_idx]->AddTask(std::move(fn)); } +void WorkQueueGroupImpl::WaitQueueGroupEmpty() { + if (nullptr == tracker_) { + PADDLE_THROW(platform::errors::Unavailable( + "set WorkQueueOptions.track_task = true for at least one of queues " + "before call this interface.")); + } + tracker_->WaitTaskNumToZero(); +} + size_t WorkQueueGroupImpl::QueueNumThreads(size_t queue_idx) const { assert(queue_idx < queues_.size()); return queues_.at(queue_idx)->NumThreads(); diff --git a/paddle/fluid/framework/new_executor/workqueue.h b/paddle/fluid/framework/new_executor/workqueue.h index e184566f914c3..ead9d9949b700 100644 --- a/paddle/fluid/framework/new_executor/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue.h @@ -21,29 +21,15 @@ namespace paddle { namespace framework { -class EventsWaiter; - struct WorkQueueOptions { WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task) : num_threads(num_threads), allow_spinning(allow_spinning), track_task(track_task) {} - WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task, - EventsWaiter* waiter) - : num_threads(num_threads), - allow_spinning(allow_spinning), - track_task(track_task), - queue_empty_waiter(waiter) {} - size_t num_threads; bool allow_spinning; - // If you need to blocking the calling thread to wait "queue empty", set - // track_task = true and set queue_empty_waiter. EventsWaiter::WaitEvent will - // block the calling thread until any of events (including "queue empty") - // occured. 
bool track_task; - EventsWaiter* queue_empty_waiter{nullptr}; }; class WorkQueue { @@ -58,8 +44,9 @@ class WorkQueue { virtual void AddTask(std::function fn) = 0; - // See WorkQueueOptions.track_task for details - // virtual void WaitQueueEmpty() = 0; + // set WorkQueueOptions.track_task = true before call this + // interface, otherwise will abort() + virtual void WaitQueueEmpty() = 0; virtual size_t NumThreads() const = 0; @@ -80,8 +67,9 @@ class WorkQueueGroup { virtual void AddTask(size_t queue_idx, std::function fn) = 0; - // See WorkQueueOptions.track_task for details - // virtual void WaitQueueGroupEmpty() = 0; + // set WorkQueueOptions.track_task = true for at least one of queues + // before call this interface, otherwise will abort() + virtual void WaitQueueGroupEmpty() = 0; virtual size_t QueueNumThreads(size_t queue_idx) const = 0; diff --git a/paddle/fluid/framework/new_executor/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue_test.cc index 51a0b58a3865f..c229a84b145ab 100644 --- a/paddle/fluid/framework/new_executor/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue_test.cc @@ -16,21 +16,18 @@ #include #include "glog/logging.h" #include "gtest/gtest.h" -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" TEST(WorkQueue, TestSingleThreadedWorkQueue) { VLOG(1) << "In Test"; using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateSingleThreadedWorkQueue; - using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kLoopNum = 1000000; // CreateSingleThreadedWorkQueue - EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true, events_waiter); + /*track_task*/ true); auto work_queue = CreateSingleThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 1u); @@ -45,7 +42,7 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { }); // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - events_waiter.WaitEvent(); + work_queue->WaitQueueEmpty(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum); } @@ -55,15 +52,13 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateMultiThreadedWorkQueue; - using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; // CreateMultiThreadedWorkQueue - EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true, events_waiter); + /*track_task*/ true); auto work_queue = CreateMultiThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 10u); @@ -80,7 +75,7 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { } // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - events_waiter.WaitEvent(); + work_queue->WaitQueueEmpty(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum); } @@ -89,17 +84,15 @@ TEST(WorkQueue, TestWorkQueueGroup) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueueGroup; using paddle::framework::CreateWorkQueueGroup; - using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; - // ThreadedWorkQueueGroup - EventsWaiter 
events_waiter; + // CreateMultiThreadedWorkQueue WorkQueueOptions sq_options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true, events_waiter); + /*track_task*/ true); WorkQueueOptions mq_options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true, events_waiter); + /*track_task*/ true); auto queue_group = CreateWorkQueueGroup({sq_options, mq_options}); // NumThreads EXPECT_EQ(queue_group->QueueNumThreads(0), 1u); @@ -120,6 +113,6 @@ TEST(WorkQueue, TestWorkQueueGroup) { } }); // WaitQueueGroupEmpty() - events_waiter.WaitEvent(); + queue_group->WaitQueueGroupEmpty(); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum + kLoopNum); } diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue_utils.cc index 9303588152c31..2ea49e676a807 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.cc +++ b/paddle/fluid/framework/new_executor/workqueue_utils.cc @@ -55,79 +55,5 @@ void AlignedFree(void* mem_ptr) { #endif } -constexpr EventsWaiter::EventId kEmptyEventId = -1; - -EventsWaiter::EventsWaiter() - : trigger_event_(kEmptyEventId), waiting_(false), cv_(1) {} - -EventsWaiter::EventId EventsWaiter::RegisterEvent(EventChecker checker) { - checkers_.push_back(std::move(checker)); - EventId id = checkers_.size() - 1; - Notifier n(id, *this); - notifiers_.push_back(n); - return id; -} - -std::reference_wrapper EventsWaiter::GetEventNotifier( - const EventId& id) { - int64_t event_num = checkers_.size(); - if (id < 0 || id > event_num) { - PADDLE_THROW(platform::errors::OutOfRange("Invalid EventId")); - } - return notifiers_[id]; -} - -EventsWaiter::EventId EventsWaiter::WaitEvent() { - // only one user can wait at any time - bool waiting = false; - if (!waiting_.compare_exchange_strong(waiting, true, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { - PADDLE_THROW( - platform::errors::ResourceExhausted("Another thread is waiting.")); - } - EventId id = trigger_event_.load(std::memory_order_acquire); - auto w = cv_.GetWaiter(0); - cv_.Prewait(); - int64_t event_num = checkers_.size(); - for (int64_t i = 0; id == kEmptyEventId && i < event_num; ++i) { - if (checkers_[i]()) { - id = i; - } - } - if (id != kEmptyEventId) { - cv_.CancelWait(); - } else { - cv_.CommitWait(w); - } - trigger_event_.store(kEmptyEventId); - waiting_.store(false); - return id; -} - -const std::set EventsWaiter::WaitEvents() { - std::set ids; - auto trigger_ev = WaitEvent(); - ids.insert(trigger_ev); - for (int64_t i = 0; i < static_cast(checkers_.size()); ++i) { - if (checkers_[i]()) { - ids.insert(i); - } - } - return ids; -} - -void EventsWaiter::SetTriggerEvent(const EventId& id) { - EventId expected_id = kEmptyEventId; - if (!trigger_event_.compare_exchange_strong(expected_id, id, - std::memory_order_seq_cst, - std::memory_order_release)) { - return; - } - cv_.Notify(true); -} - -void EventsWaiter::Notifier::NotifyEvent() { waiter_.SetTriggerEvent(id_); } - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index d04bf3bfe3c0c..bb219fea36267 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -18,9 +18,6 @@ #include #include #include -#include -#include -#include "paddle/fluid/framework/new_executor/event_count.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -67,52 +64,5 @@ void* AlignedMalloc(size_t 
size, size_t alignment); void AlignedFree(void* memory_ptr); -// A multiplexing waiter, be able to wait multi events simultaneously -// Blocking the calling thread to wait any of the registered events -class EventsWaiter { - public: - using EventId = int64_t; - - using EventChecker = std::function; - - class Notifier { - public: - void NotifyEvent(); - - private: - friend EventsWaiter; - Notifier(EventId id, EventsWaiter& waiter) : id_(id), waiter_(waiter) {} - - EventId id_; - EventsWaiter& waiter_; - }; - - EventsWaiter(); - - EventsWaiter(const EventsWaiter&) = delete; - - EventsWaiter& operator=(const EventsWaiter&) = delete; - - // RegisterEvent must be called before WaitEvent - EventId RegisterEvent(EventChecker checker); - - std::reference_wrapper GetEventNotifier(const EventId& id); - - // Wait any of the registered events - EventId WaitEvent(); - - const std::set WaitEvents(); - - private: - friend Notifier; - void SetTriggerEvent(const EventId& id); - - std::vector checkers_; - std::vector notifiers_; - std::atomic trigger_event_; - std::atomic waiting_; - EventCount cv_; -}; - } // namespace framework } // namespace paddle From ac1f4504184d0d31bd0d2bec3657eecfe20ec7d4 Mon Sep 17 00:00:00 2001 From: liutiexing Date: Wed, 8 Dec 2021 04:43:07 +0000 Subject: [PATCH 7/7] Fix RecordEvent --- paddle/fluid/platform/profiler.cc | 23 ++++++++++++++--------- paddle/fluid/platform/profiler.h | 7 +++++-- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index f6d9c8f64fd35..476a192910144 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -353,7 +353,7 @@ RecordEvent::RecordEvent(const char *name, const EventRole role) { #endif #endif if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { - RecordEvent(name, role, "none"); + OriginalConstruct(name, role, "none"); return; } shallow_copy_name_ = name; @@ -371,7 +371,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { #endif #endif if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { - RecordEvent(name, role, "none"); + OriginalConstruct(name, role, "none"); return; } name_ = new std::string(name); @@ -389,13 +389,18 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, } #endif #endif - if (g_enable_host_event_recorder_hook) { - name_ = new std::string(name); - start_ns_ = PosixInNsec(); - attr_ = new std::string(attr); + if (UNLIKELY(g_enable_host_event_recorder_hook == false)) { + OriginalConstruct(name, role, attr); return; } + name_ = new std::string(name); + start_ns_ = PosixInNsec(); + attr_ = new std::string(attr); +} +void RecordEvent::OriginalConstruct(const std::string &name, + const EventRole role, + const std::string &attr) { if (g_state == ProfilerState::kDisabled || name.empty()) return; // do some initialization @@ -408,7 +413,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role, // Maybe need the same push/pop behavior. 
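+  // Note: this path is only reached when g_enable_host_event_recorder_hook is
+  // disabled; the public constructors forward here via OriginalConstruct().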
Event *e = PushEvent(name, role, attr); SetCurAnnotation(e); - // name_ = e->name(); + *name_ = e->name(); } RecordEvent::~RecordEvent() { @@ -431,10 +436,10 @@ RecordEvent::~RecordEvent() { } else { HostEventRecorder::GetInstance().RecordEvent(*name_, start_ns_, end_ns, role_, *attr_); + delete attr_; } + delete name_; } - delete name_; - delete attr_; return; } diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index 317991160b798..9d0bdf2358900 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -139,17 +139,20 @@ struct RecordEvent { ~RecordEvent(); + void OriginalConstruct(const std::string& name, const EventRole role, + const std::string& attr); + bool is_enabled_{false}; bool is_pushed_{false}; // Event name - const std::string* name_{nullptr}; + std::string* name_{nullptr}; const char* shallow_copy_name_{nullptr}; uint64_t start_ns_; // Need to distinguish name by op type, block_id, program_id and perhaps // different kernel invocations within an op. // std::string full_name_; EventRole role_{EventRole::kOrdinary}; - const std::string* attr_{nullptr}; + std::string* attr_{nullptr}; }; /*class RecordRPCEvent {