From e1ff4a74eda953b73d1a1ce63f07d251cc84520d Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 16 Nov 2022 22:27:17 +0800 Subject: [PATCH 1/3] remove "gpu_primitives.h" in fluid namespace --- .../framework/fleet/heter_ps/feature_value.cu | 44 +- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 4 +- .../plugin/fused_token_prune_op_plugin.cu | 4 +- paddle/fluid/operators/affine_channel_op.cu | 2 +- paddle/fluid/operators/assign_pos_op.cu | 4 +- paddle/fluid/operators/batch_fc_op.cu | 2 +- paddle/fluid/operators/bilateral_slice_op.cu | 2 +- paddle/fluid/operators/center_loss_op.cu | 6 +- .../operators/collective/c_embedding_op.cu | 5 +- .../fluid/operators/collective/c_split_op.cu | 2 +- paddle/fluid/operators/conv_shift_op.cu | 2 +- paddle/fluid/operators/cvm_op.cu | 4 +- paddle/fluid/operators/data_norm_op.cu | 4 +- .../operators/deformable_psroi_pooling_op.cu | 28 +- paddle/fluid/operators/dequantize_log_op.cu | 2 +- .../fluid/operators/detection/box_clip_op.cu | 2 +- .../detection/box_decoder_and_assign_op.cu | 2 +- .../detection/collect_fpn_proposals_op.cu | 4 +- .../detection/polygon_box_transform_op.cu | 4 +- .../detection/roi_perspective_transform_op.cu | 4 +- .../detection/sigmoid_focal_loss_op.cu | 2 +- .../elementwise/elementwise_op_function.h | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 2 +- .../operators/fused/fused_softmax_mask.cu.h | 2 +- .../fluid/operators/gather_scatter_kernel.cu | 4 +- .../fluid/operators/graph_khop_sampler_op.cu | 5 +- paddle/fluid/operators/group_norm_op.cu | 8 +- paddle/fluid/operators/interpolate_op.cu | 101 ++- .../fluid/operators/limit_by_capacity_op.cu | 4 +- paddle/fluid/operators/lookup_table_op.cu | 4 +- paddle/fluid/operators/lookup_table_v2_op.cu | 6 +- .../fluid/operators/math/cos_sim_functor.cu | 4 +- paddle/fluid/operators/math/cross_entropy.cu | 2 +- paddle/fluid/operators/math/im2col.cu | 5 +- paddle/fluid/operators/math/maxouting.cu | 2 +- .../fluid/operators/math/sequence_pooling.cu | 2 +- paddle/fluid/operators/math/sequence_scale.cu | 4 +- paddle/fluid/operators/math/unpooling.cu | 2 +- paddle/fluid/operators/math/vol2col.cu | 2 +- paddle/fluid/operators/mean_iou_op.cu | 4 +- paddle/fluid/operators/number_count_op.cu | 4 +- paddle/fluid/operators/one_hot_op.cu | 4 +- paddle/fluid/operators/optimizers/sgd_op.cu | 4 +- paddle/fluid/operators/pad2d_op.cu | 18 +- paddle/fluid/operators/prroi_pool_op.h | 6 +- .../operators/prune_gate_by_capacity_op.cu | 4 +- .../operators/pull_box_extended_sparse_op.cu | 2 +- paddle/fluid/operators/pull_box_sparse_op.kps | 10 +- .../fluid/operators/pull_gpups_sparse_op.cu | 4 +- paddle/fluid/operators/quantize_linear_op.cu | 2 +- paddle/fluid/operators/random_routing_op.cu | 2 +- paddle/fluid/operators/rank_attention_op.cu | 2 +- .../sequence_ops/sequence_enumerate_op.cu | 4 +- .../sequence_ops/sequence_erase_op.cu | 4 +- .../sequence_ops/sequence_expand_as_op.cu | 2 +- .../sequence_ops/sequence_expand_op.cu | 4 +- paddle/fluid/operators/shuffle_channel_op.cu | 2 +- paddle/fluid/operators/temporal_shift_op.cu | 2 +- paddle/fluid/operators/top_k_function_cuda.h | 4 +- paddle/fluid/operators/transpose_op.cu.h | 2 +- .../platform/device/gpu/cuda_helper_test.cu | 8 +- .../platform/device/gpu/gpu_primitives.h | 606 ------------------ paddle/phi/backends/gpu/gpu_primitives.h | 2 +- 63 files changed, 193 insertions(+), 807 deletions(-) delete mode 100644 paddle/fluid/platform/device/gpu/gpu_primitives.h diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu 
b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index f05fe6c95de0a5..a273c4a5e8dda3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -13,12 +13,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace framework { -const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; +const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 @@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest, int x = low; int y = i - (x ? len[x - 1] : 0); float* feature_value_ptr = - (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); // NOLINT int mf_dim = gpu_dim[x] - 3; gpu_accessor.Select( dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); @@ -79,8 +79,9 @@ __global__ void PullDedupCopy(const size_t N, return; } - float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * - uint64_t(max_val_size)); + float* src_ptr = + (float*)((char*)src + uint64_t(restore_idx[i]) *. // NOLINT + uint64_t(max_val_size)); switch (off) { case 0: *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; @@ -125,9 +126,10 @@ __global__ void PushCopyWithPool(float* dest, } int x = low; int y = i - (x ? len[low - 1] : 0); - float* cur = (float*)((char*)dest + i * grad_value_size); + float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT - cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x]; + cur[gpu_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; // NOLINT int mf_dim = mf_dim_vector[x]; cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim; @@ -170,31 +172,29 @@ __global__ void PushMergeCopyAtomic(const size_t N, int y = i - slot_lens[x]; const float* ptr = src[x] + y * hidden; - float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size); + float* cur = + (float*)((char*)dest + d_restore_idx[i] * grad_value_size); // NOLINT int mf_dim = slot_dims[x] - 3; switch (off) { case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT cur[accessor.MfDimIndex()] = mf_dim; - paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()], - *(ptr + off)); + phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off)); break; case 1: - paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()], - *(ptr + off)); + phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off)); break; case 2: - paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()], - *(ptr + off) * -1. * bs); + phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()], + *(ptr + off) * -1. * bs); break; default: int embedx_idx = off - 3; if (mf_dim < embedx_idx) { return; } - paddle::platform::CudaAtomicAdd( - &cur[accessor.EmbedxGIndex() + embedx_idx], - *(ptr + off) * -1. * bs); + phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx], + *(ptr + off) * -1. 
* bs); break; } } @@ -228,7 +228,7 @@ __global__ void PushMergeCopy(const size_t N, int i = idx / hidden; int off = idx % hidden; // filter 0 keys - float* cur = (float*)((char*)dest + i * grad_value_size); + float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT if (total_keys[i] == 0) { switch (off) { @@ -262,7 +262,7 @@ __global__ void PushMergeCopy(const size_t N, switch (off) { case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT cur[accessor.MfDimIndex()] = mf_dim; SUM_GRAD_VALUE cur[accessor.ShowIndex()] = val; @@ -331,8 +331,8 @@ void AccessorWrapper::CopyForPushImpl( const uint64_t total_length, const int batch_size, size_t grad_value_size, - std::vector& slot_vector, - std::vector& slot_mf_dim_vector) { + std::vector& slot_vector, // NOLINT + std::vector& slot_mf_dim_vector) { // NOLINT auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 7f27b6889fc981..169b87b2b4017a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -22,12 +22,12 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace framework { -const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; +const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index e49bf16bf6878b..fe011422c19e95 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -20,8 +20,8 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" #include "paddle/fluid/operators/fused_token_prune_op.cu.h" @@ -149,7 +149,7 @@ __global__ void ReduceSum2( } if (tid == 0) { - platform::fastAtomicAdd( + phi::fastAtomicAdd( reinterpret_cast(dst), static_cast(batch * max_seq_len + col), static_cast(bsz * max_seq_len), diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index cb7e7a8d12812a..16c297459ce046 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -23,7 +23,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index f5704b6a086171..0f1107765d3844 100644 --- 
a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -23,8 +23,8 @@ We retain the following license from the original files: #include "paddle/fluid/operators/assign_pos_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" DECLARE_bool(avoid_op_randomness); @@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count, CUDA_KERNEL_LOOP(i, limit) { int number_idx = numbers[i]; if (number_idx > -1) { - int p = platform::CudaAtomicAdd(cum_count + number_idx, -1); + int p = phi::CudaAtomicAdd(cum_count + number_idx, -1); out[p - 1] = i; } } diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index b8641565729a3d..178e57d7a261a1 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 1e0d0da5dbbad0..c995c3ed091dd3 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/bilateral_slice_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index fed463d8f7cd75..44495ddf32eb38 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include "paddle/fluid/operators/center_loss_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void ComputeDifferent(T *centers_diff, @@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers, const T *diff = centers_diff + idy * D; T *cent = centers + id * D; for (int i = idx; i < D; i += BlockDimX) { - paddle::platform::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); + phi::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); } idy += BlockDimY * GridDimX; } diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 53aef8e8357343..e1fa8795d420e5 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table, auto id = ids[row]; if (id >= start_idx && id < end_idx) { auto real_idx = id - start_idx; - paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col], - output[i]); + phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]); } } } diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 2089c23fa6ec55..3539a7304010ec 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 689722d24eccba..047ef75d1fb39e 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 3db8c125ec1730..e8fdcec36082a1 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -16,12 +16,12 @@ limitations under the License. */ #include "paddle/fluid/operators/cvm_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index b040b5dfd8d61d..790e55965a9d2a 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -29,7 +29,7 @@ namespace operators { using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; using DataLayout = phi::DataLayout; -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 002f89b1620822..f1816850317a16 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -32,7 +32,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -41,7 +41,7 @@ namespace operators { using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; static inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -447,18 +447,14 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( // compute gradient of input if (bottom_data_diff) { - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y0 * width + x0, - q00 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y1 * width + x0, - q01 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y0 * width + x1, - q10 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y1 * width + x1, - q11 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0, + q00 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0, + q01 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1, + q10 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1, + q11 * diff_val); } // compute gradient of trans @@ -478,8 +474,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( u00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; - platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); - platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); + phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); + phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); } } } diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 360871f9e7251b..423363d49989ac 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dequantize_log_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index f7239b406b8fd4..8fc8ec221f3e82 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 6acc3845d24088..daae995de0d5ea 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index f244bdca35eb7c..18e52957d1acb7 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -26,8 +26,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { @@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids, int* length_lod) { CUDA_KERNEL_LOOP(i, nthreads) { - platform::CudaAtomicAdd(length_lod + batch_ids[i], 1); + phi::CudaAtomicAdd(length_lod + batch_ids[i], 1); } } diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 49e3d3d96ba5d8..bbeb9f7f2858a9 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -14,13 +14,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { using Tensor = phi::DenseTensor; -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; #define CUDA_BLOCK_SIZE 16 template diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index cd298e50cad69c..0c339b5f219f6e 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,12 +15,12 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" using paddle::platform::float16; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index bad93fd22b2e9b..7971fa0acde3d9 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index ecdec98339b42b..7bcd336732960e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -43,7 +43,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index a5f9f03493706c..b6dd3ca8f64b27 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 60723c6cb5d17c..12e511fe3aef9d 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index 80dbce4b24d285..2f17b946c6149d 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -35,7 +35,7 @@ class ReduceAdd { typename tensor_t, std::enable_if_t::value>* = nullptr> __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { - platform::CudaAtomicAdd(self_data, *src_data); + phi::CudaAtomicAdd(self_data, *src_data); } template ::value>* = nullptr> diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index c83419f3092379..2e703282bf9324 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -41,8 +41,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/graph_khop_sampler_imp.h" #include "paddle/fluid/operators/graph_khop_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" constexpr int WARP_SIZE = 32; @@ -134,8 +134,7 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, const int num = curand(&rng) % (idx + 1); #endif if (num < k) { - paddle::platform::CudaAtomicMax(output_idxs + out_row_start + num, - idx); + phi::CudaAtomicMax(output_idxs + out_row_start + num, idx); } } #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 12a989bc82b1ca..8bb400107aac49 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -22,7 +22,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/group_norm_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { typedef cub::WarpReduce WarpReduce; typename WarpReduce::TempStorage temp_storage; value = WarpReduce(temp_storage).Sum(value); - if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value); + if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value); } template @@ -429,14 +429,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, if (flags & kHasScale) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); + phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); #else CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); #endif } if (flags & kHasBias) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); + phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); #else CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); #endif diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 0e6755dd0b7ba1..a589b49500e0ae 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -126,7 +126,7 @@ __global__ void KeNearestNeighborInterpBw(T* in, in_img_idx * num_channels + channel_id]; } const T out_pos = out[out_id_h * output_w + out_id_w]; - platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); } } @@ -243,12 +243,11 @@ __global__ void KeLinearInterpBw(T* in, const T* out_pos = &out[out_id_w]; if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); } else { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]); } } } 
@@ -408,19 +407,19 @@ __global__ void KeBilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], - h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w], + h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); } else { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], + h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos[h_id * in_img_w * num_channels + w_id * num_channels], h1lambda * w1lambda * out_pos[0]); } @@ -638,22 +637,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id], - d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], - d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], - d1lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); } else { int in_pos1_idx = out_id_h * input_w + in_img_idt * in_img_h * in_img_w * num_channels + @@ -666,22 +665,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id * num_channels], - 
d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id * num_channels], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], d1lambda * h1lambda * w1lambda * out_pos[0]); } @@ -903,8 +902,8 @@ __global__ void KeBicubicInterpBw(T* in, in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + access_x * num_channels + channel_id]; } - platform::CudaAtomicAdd(&in_pos[0], - (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + phi::CudaAtomicAdd(&in_pos[0], + (out_pos[0] * y_coeffs[j] * x_coeffs[i])); } } } diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 4ca7a03b489be5..f6e0bffa1d1ce9 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -22,8 +22,8 @@ #include "paddle/fluid/operators/limit_by_capacity_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -39,7 +39,7 @@ __global__ void limit_by_capacity_impl( wid = i / n_expert; eid = i % n_expert; auto proposal = expc[wid * n_expert + eid]; - auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1)); + auto cap_left = phi::CudaAtomicAdd(cap + eid, proposal * (-1)); if (cap_left >= proposal) { out[wid * n_expert + eid] = proposal; } else if (cap_left >= 0) { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 073077f6586fae..0562228f516fac 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -93,7 +93,7 @@ __global__ void LookupTableGrad(T *table, const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } idy += BlockDimY * GridDimX; } diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 41be6b34e6e5b6..a3d8c91d862652 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -65,10 +65,10 @@ __global__ void LookupTableV2Grad(T *table, const T *out = output + idy * D; T *tab = table + id * D; #ifdef PADDLE_WITH_CUDA - paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); + phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); #else for (int i = idx; i < D; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } #endif idy += blockDim.y * gridDim.x; diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index cbe76844519a15..bb04df0879bf6f 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -44,7 +44,7 @@ __global__ void CosSimDyKernel(const T* x_norm, for (size_t i = 0; i < cols; ++i) { T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - z_data * y[i] * reciprocal_y_norm_square); - platform::CudaAtomicAdd(dy + i, dy_data); + phi::CudaAtomicAdd(dy + i, dy_data); } } } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 0e5b95542455e3..6947801bf76eab 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 843e50c50a697b..5c7038714e93c6 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -466,8 +466,7 @@ __global__ void col2imOCF(const T* col_data, if (height_offset >= 0 && height_offset < im_height && width_offset >= 0 && width_offset < im_width) { - paddle::platform::CudaAtomicAdd(im_data + im_offset, - col_data[col_offset]); + phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]); } } } diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index df115fd16966db..9f1d2286395a47 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index eadf0d070b9018..530b68bbfbb3c9 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/sequence_pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index a0bb2a1ac33ced..21010ca33148e0 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/sequence_scale.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { namespace math { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void SequenceScaleKernel(T* seq, diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index e3d7abb6e0d716..0ecac6c5fb07a7 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 765f31eba34f01..999e29470ebbd1 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 3e7f8a5363ac0b..e73496a46a0add 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -15,13 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/mean_iou_op.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void CountCUDAKernel(const int num_classes, diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index dcbf95d059185d..25541ebdb36217 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -22,8 +22,8 @@ #include "paddle/fluid/operators/number_count_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -77,7 +77,7 @@ __global__ void NumberCount(const T* numbers, #endif } if (threadIdx.x % WARP_SIZE == 0) { - platform::CudaAtomicAdd(number_count + i, x); + phi::CudaAtomicAdd(number_count + i, x); } } } diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 1a2939366f3910..b36ca97b3e40f9 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -14,11 +14,11 @@ #include "paddle/fluid/operators/one_hot_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void FillOutputKernel(const InT* p_in_data, diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 05b00bac890a71..385e9a70e5489b 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( tensor_out_ptr + index, -static_cast(1.0) * learning_rate[0] * selected_rows_ptr[index]); } diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index c76a6b61e780e9..7b0dd2149dead5 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -16,13 +16,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad2DConstNCHW(const int nthreads, @@ -257,9 +257,8 @@ __global__ void Pad2DGradReflectNCHW(const int out_size, in_w = max(in_w, -in_w); in_h = min(in_h, 2 * in_height - in_h - 2); in_w = min(in_w, 2 * in_width - in_w - 2); - platform::CudaAtomicAdd( - &d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); + phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); } } @@ -288,7 +287,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size, in_w = max(in_w, -in_w); in_h = min(in_h, in_height * 2 - in_h - 2); in_w = min(in_w, in_width * 2 - in_w - 2); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], d_out_data[out_index]); } @@ -313,9 +312,8 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size, nc /= out_height; const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - platform::CudaAtomicAdd( - &d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); + phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); } } @@ -340,7 +338,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, n /= out_height; const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], d_out_data[out_index]); } diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index d4375b2fc48ce4..07a2bde7e94e46 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #endif namespace paddle { @@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T coeff) { bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); if (!overflow) { - paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); + phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); } } #else @@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, #if defined(__NVCC__) || defined(__HIPCC__) template DEVICE void AccumulateRois(T* offset, T data) { - paddle::platform::CudaAtomicAdd(offset, data); + phi::CudaAtomicAdd(offset, data); } #else template diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 9f5751fe0bdc7e..f21d640384c4d4 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -21,7 +21,7 @@ // Licensed under the Apache License, Version 2.0 (the "License"). 
#include "paddle/fluid/operators/prune_gate_by_capacity_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data, const int64_t batch_size) { CUDA_KERNEL_LOOP(i, batch_size) { auto orig_cap = - platform::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1); + phi::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1); if (orig_cap <= 0) { new_gate_idx_data[i] = -1; } else { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index 26a02ea622479f..cfa317a3d392fb 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/pull_box_extended_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_sparse_op.kps b/paddle/fluid/operators/pull_box_sparse_op.kps index 6b7c7c8495108e..4b0580c5e1ab5c 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.kps +++ b/paddle/fluid/operators/pull_box_sparse_op.kps @@ -37,7 +37,7 @@ limitations under the License. */ #include "xpu/kernel/math.h" // NOLINT #else #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #endif #include "paddle/fluid/operators/pull_box_sparse_op.h" @@ -46,9 +46,13 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; #ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace, +REGISTER_OP_KERNEL(pull_box_sparse, + KP, + plat::XPUPlace, ops::PullBoxSparseKernel); -REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace, +REGISTER_OP_KERNEL(push_box_sparse, + KP, + plat::XPUPlace, ops::PushBoxSparseKernel); #else REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel); diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cu b/paddle/fluid/operators/pull_gpups_sparse_op.cu index 996eacf428979d..d22c632d60dd25 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cu +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cu @@ -14,11 +14,11 @@ #include "paddle/fluid/operators/pull_gpups_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index c5d8b1928fd78e..259c1507af038a 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h" #include "paddle/fluid/operators/quantize_linear_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index 287a523f61f3bc..1fdb1bf73a3047 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/random_routing_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 36117e605031eb..8107e520b0492e 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 6a1afcc18e68c7..0f53f292ef8ae4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -16,11 +16,11 @@ #include #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index 74789ecde9d386..d8b0afbc85dc59 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index d5beedd35338ae..f565e0d438a0e6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 2c9b6408a7390f..e4ebd47878cb2b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { for (int tid_x = threadIdx.x; tid_x < x_item_length; tid_x += blockDim.x) { - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &dx_data[(x_offset + tid_y) * x_item_length + tid_x], dout_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length + diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 26eee095377c0a..4869a4c6c5e223 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index cad6f416d41d8d..d2583aeb143ec6 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -11,7 +11,7 @@ #include "paddle/fluid/operators/temporal_shift_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index faf2b08089157d..e95bca3c2791ef 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -28,8 +28,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #define FINAL_MASK 0xffffffff #ifdef __HIPCC__ @@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input, if (GetLaneId() == 0) { #pragma unroll for (uint32_t i = 0; i < RadixSize; ++i) { - platform::CudaAtomicAdd(&shared_mem[i], counts[i]); + phi::CudaAtomicAdd(&shared_mem[i], counts[i]); } } diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index eb9e8a7bed7845..4fc610c393f103 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,9 +16,9 @@ limitations under the License. */ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h" diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu index 68229bba74ccd8..a3fff0dbed8e2e 100644 --- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -24,17 +24,15 @@ #define PADDLE_CUDA_FP16 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" using paddle::platform::float16; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void AddKernel(const T* data_a, T* data_b, size_t num) { - CUDA_KERNEL_LOOP(i, num) { - paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); - } + CUDA_KERNEL_LOOP(i, num) { phi::CudaAtomicAdd(&data_b[i], data_a[i]); } } template diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h deleted file mode 100644 index 96eddf09237d98..00000000000000 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ /dev/null @@ -1,606 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_WITH_CUDA -#include -#endif -#ifdef PADDLE_WITH_HIP -#include -#endif -#include - -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace platform { - -#define CUDA_ATOMIC_WRAPPER(op, T) \ - __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) - -#define USE_CUDA_ATOMIC(op, T) \ - CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } - -// Default thread count per block(or block size). -// TODO(typhoonzero): need to benchmark against setting this value -// to 1024. -constexpr int PADDLE_CUDA_NUM_THREADS = 512; - -// For atomicAdd. -USE_CUDA_ATOMIC(Add, float); -USE_CUDA_ATOMIC(Add, int); -USE_CUDA_ATOMIC(Add, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT - -CUDA_ATOMIC_WRAPPER(Add, int64_t) { - // Here, we check long long int must be int64_t. - static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - return CudaAtomicAdd( - reinterpret_cast(address), // NOLINT - static_cast(val)); // NOLINT -} - -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) -USE_CUDA_ATOMIC(Add, double); -#else -CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int *address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - old = atomicCAS(address_as_ull, - assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -#ifdef PADDLE_CUDA_FP16 -// NOTE(dzhwinter): cuda do not have atomicCAS for half. -// Just use the half address as a unsigned value address and -// do the atomicCAS. According to the value store at high 16 bits -// or low 16 bits, then do a different sum and CAS. -// Given most warp-threads will failed on the atomicCAS, so this -// implemented should be avoided in high concurrency. It's will be -// slower than the way convert value into 32bits and do a full atomicCAS. - -// convert the value into float and do the add arithmetic. -// then store the result into a uint32. 
-inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { - float16 low_half; - // the float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { - float16 high_half; - // the float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { - return *reinterpret_cast(&x); -} - -static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { - return *reinterpret_cast<__half *>(&x); -} - -CUDA_ATOMIC_WRAPPER(Add, float16) { - return CUDAFP16ToPDFP16( - atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); -} -#else -CUDA_ATOMIC_WRAPPER(Add, float16) { - // concrete packed float16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" -// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. -template ::value>::type * = nullptr> -__device__ __forceinline__ void fastAtomicAdd(T *tensor, - size_t index, - const size_t numel, - T value) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - CudaAtomicAdd(reinterpret_cast(tensor) + index, - static_cast(value)); -#else - // whether the address is 32-byte aligned. - __half *target_addr = reinterpret_cast<__half *>(tensor + index); - bool aligned_half2 = - (reinterpret_cast(target_addr) % sizeof(__half2) == 0); - - if (aligned_half2 && index < (numel - 1)) { - __half2 value2; - value2.x = *reinterpret_cast<__half *>(&value); - value2.y = __int2half_rz(0); - atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); - - } else if (!aligned_half2 && index > 0) { - __half2 value2; - value2.x = __int2half_rz(0); - value2.y = *reinterpret_cast<__half *>(&value); - atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); - - } else { - atomicAdd(reinterpret_cast<__half *>(tensor) + index, - *reinterpret_cast<__half *>(&value)); - } -#endif -} - -template ::value>::type * = nullptr> -__device__ __forceinline__ void fastAtomicAdd(T *arr, - size_t index, - const size_t numel, - T value) { - CudaAtomicAdd(arr + index, value); -} -#endif - -// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. 
-inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { - bfloat16 low_half; - // the bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { - bfloat16 high_half; - // the bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { - return *reinterpret_cast(&x); -} - -static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { - return *reinterpret_cast<__nv_bfloat16 *>(&x); -} - -CUDA_ATOMIC_WRAPPER(Add, bfloat16) { - return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), - PDBF16ToCUDABF16(val))); -} -#else -CUDA_ATOMIC_WRAPPER(Add, bfloat16) { - // concrete packed bfloat16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); - } while (old != assumed); - bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); - } while (old != assumed); - bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -CUDA_ATOMIC_WRAPPER(Add, complex) { - float *real = reinterpret_cast(address); - float *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -CUDA_ATOMIC_WRAPPER(Add, complex) { - double *real = reinterpret_cast(address); - double *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -// For atomicMax -USE_CUDA_ATOMIC(Max, int); -USE_CUDA_ATOMIC(Max, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) -USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT -#else -CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT - if (*address >= val) { - return *address; - } - - unsigned long long int old = *address, assumed; // NOLINT - - do { - assumed = old; - if (assumed >= val) { - break; - } - - old = atomicCAS(address, assumed, val); - } while (assumed != old); -} -#endif - -CUDA_ATOMIC_WRAPPER(Max, int64_t) { - // Here, we check long long int must be int64_t. 
- static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - long long int res = *address; // NOLINT - while (val > res) { - long long int old = res; // NOLINT - res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT - (unsigned long long int)old, // NOLINT - (unsigned long long int)val); // NOLINT - if (res == old) { - break; - } - } - return res; -} - -CUDA_ATOMIC_WRAPPER(Max, float) { - if (*address >= val) { - return *address; - } - - int *const address_as_i = reinterpret_cast(address); - int old = *address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) >= val) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(val)); - } while (assumed != old); - - return __int_as_float(old); -} - -CUDA_ATOMIC_WRAPPER(Max, double) { - if (*address >= val) { - return *address; - } - - unsigned long long int *const address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - if (__longlong_as_double(assumed) >= val) { - break; - } - - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); - } while (assumed != old); - - return __longlong_as_double(old); -} - -#ifdef PADDLE_CUDA_FP16 -inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { - float16 low_half; - // The float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(max(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) { - float16 high_half; - // The float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(max(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Max, float16) { - if (*address >= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -// For atomicMin -USE_CUDA_ATOMIC(Min, int); -USE_CUDA_ATOMIC(Min, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) -USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT -#else -CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT - if (*address <= val) { - return *address; - } - - unsigned long long int old = *address, assumed; // NOLINT - - do { - assumed = old; - if (assumed <= val) { - break; - } - - old = atomicCAS(address, assumed, val); - } while (assumed != old); -} -#endif - -CUDA_ATOMIC_WRAPPER(Min, int64_t) { - // Here, we check long long int must be int64_t. 
- static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - long long int res = *address; // NOLINT - while (val < res) { - long long int old = res; // NOLINT - res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT - (unsigned long long int)old, // NOLINT - (unsigned long long int)val); // NOLINT - if (res == old) { - break; - } - } - return res; -} - -CUDA_ATOMIC_WRAPPER(Min, float) { - if (*address <= val) { - return *address; - } - - int *const address_as_i = reinterpret_cast(address); - int old = *address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) <= val) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(val)); - } while (assumed != old); - - return __int_as_float(old); -} - -CUDA_ATOMIC_WRAPPER(Min, double) { - if (*address <= val) { - return *address; - } - - unsigned long long int *const address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - if (__longlong_as_double(assumed) <= val) { - break; - } - - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); - } while (assumed != old); - - return __longlong_as_double(old); -} - -#ifdef PADDLE_CUDA_FP16 -inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { - float16 low_half; - // The float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(min(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) { - float16 high_half; - // The float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(min(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Min, float16) { - if (*address <= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -#ifdef PADDLE_CUDA_FP16 -#ifdef PADDLE_WITH_CUDA -/* - * One thead block deals with elementwise atomicAdd for vector of len. - * @in: [x1, x2, x3, ...] - * @out:[y1+x1, y2+x2, y3+x3, ...] - * */ -template ::value>::type * = nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -} - -// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. 
-template ::value>::type * = nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -#else - int i = 0; - int loops = len / 2 * 2; - - bool aligned_half2 = - (reinterpret_cast(out) % sizeof(__half2) == 0); - - if (aligned_half2) { - for (i = tid * 2; i < loops; i += threads_per_block * 2) { - __half2 value2; - T value_1 = in[i]; - T value_2 = in[i + 1]; - value2.x = *reinterpret_cast<__half *>(&value_1); - value2.y = *reinterpret_cast<__half *>(&value_2); - atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); - } - for (; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } else { - for (int i = tid; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } -#endif -} -#endif -#endif -} // namespace platform -} // namespace paddle diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index be08f29aa81502..2924b1b6bb3df2 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From fa0d9f919f6b01313532568962337a678e8596f1 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 16 Nov 2022 22:40:43 +0800 Subject: [PATCH 2/3] fix PR-CI-GpuPS fail --- paddle/fluid/framework/fleet/heter_ps/feature_value.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index a273c4a5e8dda3..80a827e6ad0e89 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -79,9 +79,8 @@ __global__ void PullDedupCopy(const size_t N, return; } - float* src_ptr = - (float*)((char*)src + uint64_t(restore_idx[i]) *. // NOLINT - uint64_t(max_val_size)); + float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * // NOLINT + uint64_t(max_val_size)); switch (off) { case 0: *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; From 2f140abdc1e742eb7422b5cc76c133e5deaff6c5 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Thu, 17 Nov 2022 13:25:14 +0800 Subject: [PATCH 3/3] fix PR-CI-GpuPS fail --- paddle/fluid/operators/dequantize_log_op.cu | 2 +- paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 423363d49989ac..18719c76b2ef08 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/

 #include "paddle/fluid/operators/dequantize_log_op.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/math.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
index e8c14269878006..56d28c20dc8e76 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math.h"
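
Note (illustrative sketch, not part of the patch series): after this change, GPU call sites include "paddle/phi/backends/gpu/gpu_primitives.h" and reach the helpers through the phi namespace, as the hunks above do. A minimal standalone kernel in that style, mirroring the AddKernel in cuda_helper_test.cu; the names AccumulateKernel, src, dst, and n are hypothetical:

#include "paddle/phi/backends/gpu/gpu_primitives.h"

// Accumulate src into dst element-wise; the atomic add tolerates overlapping writers.
__global__ void AccumulateKernel(const float* src, float* dst, size_t n) {
  // Plain grid-stride loop, the same pattern CUDA_KERNEL_LOOP expands to.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    phi::CudaAtomicAdd(&dst[i], src[i]);  // was paddle::platform::CudaAtomicAdd
  }
}

// Launch with the default block size exposed by the phi header:
//   int threads = phi::PADDLE_CUDA_NUM_THREADS;
//   AccumulateKernel<<<(n + threads - 1) / threads, threads>>>(src, dst, n);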