From e1ff4a74eda953b73d1a1ce63f07d251cc84520d Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 16 Nov 2022 22:27:17 +0800 Subject: [PATCH 1/3] remove "gpu_primitives.h" in fluid namespace --- .../framework/fleet/heter_ps/feature_value.cu | 44 +- .../fluid/framework/fleet/ps_gpu_wrapper.cu | 4 +- .../plugin/fused_token_prune_op_plugin.cu | 4 +- paddle/fluid/operators/affine_channel_op.cu | 2 +- paddle/fluid/operators/assign_pos_op.cu | 4 +- paddle/fluid/operators/batch_fc_op.cu | 2 +- paddle/fluid/operators/bilateral_slice_op.cu | 2 +- paddle/fluid/operators/center_loss_op.cu | 6 +- .../operators/collective/c_embedding_op.cu | 5 +- .../fluid/operators/collective/c_split_op.cu | 2 +- paddle/fluid/operators/conv_shift_op.cu | 2 +- paddle/fluid/operators/cvm_op.cu | 4 +- paddle/fluid/operators/data_norm_op.cu | 4 +- .../operators/deformable_psroi_pooling_op.cu | 28 +- paddle/fluid/operators/dequantize_log_op.cu | 2 +- .../fluid/operators/detection/box_clip_op.cu | 2 +- .../detection/box_decoder_and_assign_op.cu | 2 +- .../detection/collect_fpn_proposals_op.cu | 4 +- .../detection/polygon_box_transform_op.cu | 4 +- .../detection/roi_perspective_transform_op.cu | 4 +- .../detection/sigmoid_focal_loss_op.cu | 2 +- .../elementwise/elementwise_op_function.h | 2 +- paddle/fluid/operators/fake_quantize_op.cu.h | 2 +- .../operators/fused/fused_softmax_mask.cu.h | 2 +- .../fluid/operators/gather_scatter_kernel.cu | 4 +- .../fluid/operators/graph_khop_sampler_op.cu | 5 +- paddle/fluid/operators/group_norm_op.cu | 8 +- paddle/fluid/operators/interpolate_op.cu | 101 ++- .../fluid/operators/limit_by_capacity_op.cu | 4 +- paddle/fluid/operators/lookup_table_op.cu | 4 +- paddle/fluid/operators/lookup_table_v2_op.cu | 6 +- .../fluid/operators/math/cos_sim_functor.cu | 4 +- paddle/fluid/operators/math/cross_entropy.cu | 2 +- paddle/fluid/operators/math/im2col.cu | 5 +- paddle/fluid/operators/math/maxouting.cu | 2 +- .../fluid/operators/math/sequence_pooling.cu | 2 +- paddle/fluid/operators/math/sequence_scale.cu | 4 +- paddle/fluid/operators/math/unpooling.cu | 2 +- paddle/fluid/operators/math/vol2col.cu | 2 +- paddle/fluid/operators/mean_iou_op.cu | 4 +- paddle/fluid/operators/number_count_op.cu | 4 +- paddle/fluid/operators/one_hot_op.cu | 4 +- paddle/fluid/operators/optimizers/sgd_op.cu | 4 +- paddle/fluid/operators/pad2d_op.cu | 18 +- paddle/fluid/operators/prroi_pool_op.h | 6 +- .../operators/prune_gate_by_capacity_op.cu | 4 +- .../operators/pull_box_extended_sparse_op.cu | 2 +- paddle/fluid/operators/pull_box_sparse_op.kps | 10 +- .../fluid/operators/pull_gpups_sparse_op.cu | 4 +- paddle/fluid/operators/quantize_linear_op.cu | 2 +- paddle/fluid/operators/random_routing_op.cu | 2 +- paddle/fluid/operators/rank_attention_op.cu | 2 +- .../sequence_ops/sequence_enumerate_op.cu | 4 +- .../sequence_ops/sequence_erase_op.cu | 4 +- .../sequence_ops/sequence_expand_as_op.cu | 2 +- .../sequence_ops/sequence_expand_op.cu | 4 +- paddle/fluid/operators/shuffle_channel_op.cu | 2 +- paddle/fluid/operators/temporal_shift_op.cu | 2 +- paddle/fluid/operators/top_k_function_cuda.h | 4 +- paddle/fluid/operators/transpose_op.cu.h | 2 +- .../platform/device/gpu/cuda_helper_test.cu | 8 +- .../platform/device/gpu/gpu_primitives.h | 606 ------------------ paddle/phi/backends/gpu/gpu_primitives.h | 2 +- 63 files changed, 193 insertions(+), 807 deletions(-) delete mode 100644 paddle/fluid/platform/device/gpu/gpu_primitives.h diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu 
b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index f05fe6c95de0a5..a273c4a5e8dda3 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -13,12 +13,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_HETERPS #include "paddle/fluid/framework/fleet/heter_ps/feature_value.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace framework { -const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; +const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 @@ -45,7 +45,7 @@ __global__ void PullCopy(float** dest, int x = low; int y = i - (x ? len[x - 1] : 0); float* feature_value_ptr = - (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); + (float*)((char*)src + uint64_t(i) * uint64_t(max_val_size)); // NOLINT int mf_dim = gpu_dim[x] - 3; gpu_accessor.Select( dest[x] + y * (mf_dim + 3), feature_value_ptr, keys[x] + y, mf_dim); @@ -79,8 +79,9 @@ __global__ void PullDedupCopy(const size_t N, return; } - float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * - uint64_t(max_val_size)); + float* src_ptr = + (float*)((char*)src + uint64_t(restore_idx[i]) *. // NOLINT + uint64_t(max_val_size)); switch (off) { case 0: *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; @@ -125,9 +126,10 @@ __global__ void PushCopyWithPool(float* dest, } int x = low; int y = i - (x ? len[low - 1] : 0); - float* cur = (float*)((char*)dest + i * grad_value_size); + float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT - cur[gpu_accessor.common_push_value.SlotIndex()] = (float)slot_vector[x]; + cur[gpu_accessor.common_push_value.SlotIndex()] = + (float)slot_vector[x]; // NOLINT int mf_dim = mf_dim_vector[x]; cur[gpu_accessor.common_push_value.MfDimIndex()] = mf_dim; @@ -170,31 +172,29 @@ __global__ void PushMergeCopyAtomic(const size_t N, int y = i - slot_lens[x]; const float* ptr = src[x] + y * hidden; - float* cur = (float*)((char*)dest + d_restore_idx[i] * grad_value_size); + float* cur = + (float*)((char*)dest + d_restore_idx[i] * grad_value_size); // NOLINT int mf_dim = slot_dims[x] - 3; switch (off) { case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT cur[accessor.MfDimIndex()] = mf_dim; - paddle::platform::CudaAtomicAdd(&cur[accessor.ShowIndex()], - *(ptr + off)); + phi::CudaAtomicAdd(&cur[accessor.ShowIndex()], *(ptr + off)); break; case 1: - paddle::platform::CudaAtomicAdd(&cur[accessor.ClickIndex()], - *(ptr + off)); + phi::CudaAtomicAdd(&cur[accessor.ClickIndex()], *(ptr + off)); break; case 2: - paddle::platform::CudaAtomicAdd(&cur[accessor.EmbedGIndex()], - *(ptr + off) * -1. * bs); + phi::CudaAtomicAdd(&cur[accessor.EmbedGIndex()], + *(ptr + off) * -1. * bs); break; default: int embedx_idx = off - 3; if (mf_dim < embedx_idx) { return; } - paddle::platform::CudaAtomicAdd( - &cur[accessor.EmbedxGIndex() + embedx_idx], - *(ptr + off) * -1. * bs); + phi::CudaAtomicAdd(&cur[accessor.EmbedxGIndex() + embedx_idx], + *(ptr + off) * -1. 
* bs); break; } } @@ -228,7 +228,7 @@ __global__ void PushMergeCopy(const size_t N, int i = idx / hidden; int off = idx % hidden; // filter 0 keys - float* cur = (float*)((char*)dest + i * grad_value_size); + float* cur = (float*)((char*)dest + i * grad_value_size); // NOLINT if (total_keys[i] == 0) { switch (off) { @@ -262,7 +262,7 @@ __global__ void PushMergeCopy(const size_t N, switch (off) { case 0: - cur[accessor.SlotIndex()] = (float)slot_vector[x]; + cur[accessor.SlotIndex()] = (float)slot_vector[x]; // NOLINT cur[accessor.MfDimIndex()] = mf_dim; SUM_GRAD_VALUE cur[accessor.ShowIndex()] = val; @@ -331,8 +331,8 @@ void AccessorWrapper::CopyForPushImpl( const uint64_t total_length, const int batch_size, size_t grad_value_size, - std::vector& slot_vector, - std::vector& slot_mf_dim_vector) { + std::vector& slot_vector, // NOLINT + std::vector& slot_mf_dim_vector) { // NOLINT auto stream = dynamic_cast( paddle::platform::DeviceContextPool::Instance().Get(place)) ->stream(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu index 7f27b6889fc981..169b87b2b4017a 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cu @@ -22,12 +22,12 @@ limitations under the License. */ #include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace framework { -const int CUDA_NUM_THREADS = platform::PADDLE_CUDA_NUM_THREADS; +const int CUDA_NUM_THREADS = phi::PADDLE_CUDA_NUM_THREADS; #define GET_BLOCK(N) ((N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS) #define CUDA_BLOCK(N) GET_BLOCK(N), CUDA_NUM_THREADS, 0 diff --git a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu index e49bf16bf6878b..fe011422c19e95 100644 --- a/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.cu @@ -20,8 +20,8 @@ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/fluid/inference/tensorrt/plugin/fused_token_prune_op_plugin.h" #include "paddle/fluid/operators/fused_token_prune_op.cu.h" @@ -149,7 +149,7 @@ __global__ void ReduceSum2( } if (tid == 0) { - platform::fastAtomicAdd( + phi::fastAtomicAdd( reinterpret_cast(dst), static_cast(batch * max_seq_len + col), static_cast(bsz * max_seq_len), diff --git a/paddle/fluid/operators/affine_channel_op.cu b/paddle/fluid/operators/affine_channel_op.cu index cb7e7a8d12812a..16c297459ce046 100644 --- a/paddle/fluid/operators/affine_channel_op.cu +++ b/paddle/fluid/operators/affine_channel_op.cu @@ -23,7 +23,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/assign_pos_op.cu b/paddle/fluid/operators/assign_pos_op.cu index f5704b6a086171..0f1107765d3844 100644 --- 
a/paddle/fluid/operators/assign_pos_op.cu +++ b/paddle/fluid/operators/assign_pos_op.cu @@ -23,8 +23,8 @@ We retain the following license from the original files: #include "paddle/fluid/operators/assign_pos_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" DECLARE_bool(avoid_op_randomness); @@ -47,7 +47,7 @@ __global__ void AssignPos(T* cum_count, CUDA_KERNEL_LOOP(i, limit) { int number_idx = numbers[i]; if (number_idx > -1) { - int p = platform::CudaAtomicAdd(cum_count + number_idx, -1); + int p = phi::CudaAtomicAdd(cum_count + number_idx, -1); out[p - 1] = i; } } diff --git a/paddle/fluid/operators/batch_fc_op.cu b/paddle/fluid/operators/batch_fc_op.cu index b8641565729a3d..178e57d7a261a1 100644 --- a/paddle/fluid/operators/batch_fc_op.cu +++ b/paddle/fluid/operators/batch_fc_op.cu @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/batch_fc_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/bilateral_slice_op.cu b/paddle/fluid/operators/bilateral_slice_op.cu index 1e0d0da5dbbad0..c995c3ed091dd3 100644 --- a/paddle/fluid/operators/bilateral_slice_op.cu +++ b/paddle/fluid/operators/bilateral_slice_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/bilateral_slice_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/center_loss_op.cu b/paddle/fluid/operators/center_loss_op.cu index fed463d8f7cd75..44495ddf32eb38 100644 --- a/paddle/fluid/operators/center_loss_op.cu +++ b/paddle/fluid/operators/center_loss_op.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include "paddle/fluid/operators/center_loss_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void ComputeDifferent(T *centers_diff, @@ -75,7 +75,7 @@ __global__ void UpdateCenters(T *centers, const T *diff = centers_diff + idy * D; T *cent = centers + id * D; for (int i = idx; i < D; i += BlockDimX) { - paddle::platform::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); + phi::CudaAtomicAdd(¢[i], alpha[0] * diff[i] / count); } idy += BlockDimY * GridDimX; } diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 53aef8e8357343..e1fa8795d420e5 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -77,8 +77,7 @@ __global__ void CEmbeddingGrad(T *table, auto id = ids[row]; if (id >= start_idx && id < end_idx) { auto real_idx = id - start_idx; - paddle::platform::CudaAtomicAdd(&table[real_idx * columns + col], - output[i]); + phi::CudaAtomicAdd(&table[real_idx * columns + col], output[i]); } } } diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index 2089c23fa6ec55..3539a7304010ec 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/collective/c_split_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 689722d24eccba..047ef75d1fb39e 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/cvm_op.cu b/paddle/fluid/operators/cvm_op.cu index 3db8c125ec1730..e8fdcec36082a1 100644 --- a/paddle/fluid/operators/cvm_op.cu +++ b/paddle/fluid/operators/cvm_op.cu @@ -16,12 +16,12 @@ limitations under the License. */ #include "paddle/fluid/operators/cvm_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; diff --git a/paddle/fluid/operators/data_norm_op.cu b/paddle/fluid/operators/data_norm_op.cu index b040b5dfd8d61d..790e55965a9d2a 100644 --- a/paddle/fluid/operators/data_norm_op.cu +++ b/paddle/fluid/operators/data_norm_op.cu @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/data_norm_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" @@ -29,7 +29,7 @@ namespace operators { using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; using DataLayout = phi::DataLayout; -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index 002f89b1620822..f1816850317a16 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -32,7 +32,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -41,7 +41,7 @@ namespace operators { using Tensor = phi::DenseTensor; using LoDTensor = phi::DenseTensor; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; static inline int GET_BLOCKS(const int N) { return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; @@ -447,18 +447,14 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( // compute gradient of input if (bottom_data_diff) { - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y0 * width + x0, - q00 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y1 * width + x0, - q01 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y0 * width + x1, - q10 * diff_val); - platform::CudaAtomicAdd( - bottom_data_diff + bottom_index + y1 * width + x1, - q11 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x0, + q00 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x0, + q01 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y0 * width + x1, + q10 * diff_val); + phi::CudaAtomicAdd(bottom_data_diff + bottom_index + y1 * width + x1, + q11 * diff_val); } // compute gradient of trans @@ -478,8 +474,8 @@ __global__ void DeformablePSROIPoolBackwardAccKernel( u00 * (1 - dist_x)) * trans_std * diff_val; diff_y *= roi_height; - platform::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); - platform::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); + phi::CudaAtomicAdd(bottom_trans_diff + trans_index_x, diff_x); + phi::CudaAtomicAdd(bottom_trans_diff + trans_index_y, diff_y); } } } diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 360871f9e7251b..423363d49989ac 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/dequantize_log_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index f7239b406b8fd4..8fc8ec221f3e82 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -15,7 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" #include "paddle/phi/kernels/funcs/math_function.h" diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu index 6acc3845d24088..daae995de0d5ea 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.cu @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/box_decoder_and_assign_op.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index f244bdca35eb7c..18e52957d1acb7 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -26,8 +26,8 @@ namespace cub = hipcub; #include "paddle/fluid/operators/detection/collect_fpn_proposals_op.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/gather.cu.h" namespace paddle { @@ -50,7 +50,7 @@ static __global__ void GetLengthLoD(const int nthreads, const int* batch_ids, int* length_lod) { CUDA_KERNEL_LOOP(i, nthreads) { - platform::CudaAtomicAdd(length_lod + batch_ids[i], 1); + phi::CudaAtomicAdd(length_lod + batch_ids[i], 1); } } diff --git a/paddle/fluid/operators/detection/polygon_box_transform_op.cu b/paddle/fluid/operators/detection/polygon_box_transform_op.cu index 49e3d3d96ba5d8..bbeb9f7f2858a9 100644 --- a/paddle/fluid/operators/detection/polygon_box_transform_op.cu +++ b/paddle/fluid/operators/detection/polygon_box_transform_op.cu @@ -14,13 +14,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { using Tensor = phi::DenseTensor; -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; #define CUDA_BLOCK_SIZE 16 template diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index cd298e50cad69c..0c339b5f219f6e 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -15,12 +15,12 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" using paddle::platform::float16; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu index bad93fd22b2e9b..7971fa0acde3d9 100644 --- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu +++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h" #include "paddle/fluid/operators/math.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/hostdevice.h" namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index ecdec98339b42b..7bcd336732960e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -43,7 +43,7 @@ limitations under the License. */ #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/gpu/elementwise_grad.h" #endif diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index a5f9f03493706c..b6dd3ca8f64b27 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -20,7 +20,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/fake_quantize_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h index 60723c6cb5d17c..12e511fe3aef9d 100644 --- a/paddle/fluid/operators/fused/fused_softmax_mask.cu.h +++ b/paddle/fluid/operators/fused/fused_softmax_mask.cu.h @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/aligned_vector.h" namespace paddle { diff --git a/paddle/fluid/operators/gather_scatter_kernel.cu b/paddle/fluid/operators/gather_scatter_kernel.cu index 80dbce4b24d285..2f17b946c6149d 100644 --- a/paddle/fluid/operators/gather_scatter_kernel.cu +++ b/paddle/fluid/operators/gather_scatter_kernel.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -35,7 +35,7 @@ class ReduceAdd { typename tensor_t, std::enable_if_t::value>* = nullptr> __device__ void operator()(tensor_t* self_data, tensor_t* src_data) const { - platform::CudaAtomicAdd(self_data, *src_data); + phi::CudaAtomicAdd(self_data, *src_data); } template ::value>* = nullptr> diff --git a/paddle/fluid/operators/graph_khop_sampler_op.cu b/paddle/fluid/operators/graph_khop_sampler_op.cu index c83419f3092379..2e703282bf9324 100644 --- a/paddle/fluid/operators/graph_khop_sampler_op.cu +++ b/paddle/fluid/operators/graph_khop_sampler_op.cu @@ -41,8 +41,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/graph_khop_sampler_imp.h" #include "paddle/fluid/operators/graph_khop_sampler_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" constexpr int WARP_SIZE = 32; @@ -134,8 +134,7 @@ __global__ void GraphSampleNeighborsCUDAKernel(const uint64_t rand_seed, const int num = curand(&rng) % (idx + 1); #endif if (num < k) { - paddle::platform::CudaAtomicMax(output_idxs + out_row_start + num, - idx); + phi::CudaAtomicMax(output_idxs + out_row_start + num, idx); } } #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 12a989bc82b1ca..8bb400107aac49 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -22,7 +22,7 @@ namespace cub = hipcub; #include "paddle/fluid/operators/group_norm_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -51,7 +51,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { typedef cub::WarpReduce WarpReduce; typename WarpReduce::TempStorage temp_storage; value = WarpReduce(temp_storage).Sum(value); - if (cub::LaneId() == 0) platform::CudaAtomicAdd(sum, value); + if (cub::LaneId() == 0) phi::CudaAtomicAdd(sum, value); } template @@ -429,14 +429,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, if (flags & kHasScale) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); + phi::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); #else CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); #endif } if (flags & kHasBias) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); + phi::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); #else CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); #endif diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 0e6755dd0b7ba1..a589b49500e0ae 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/interpolate_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -126,7 +126,7 @@ __global__ void KeNearestNeighborInterpBw(T* in, in_img_idx * num_channels + channel_id]; } const T out_pos = out[out_id_h * output_w + out_id_w]; - platform::CudaAtomicAdd(in_pos, out_pos); + phi::CudaAtomicAdd(in_pos, out_pos); } } @@ -243,12 +243,11 @@ __global__ void KeLinearInterpBw(T* in, const T* out_pos = &out[out_id_w]; if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], w1lambda * out_pos[0]); } else { - platform::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], w1lambda * out_pos[0]); } } } 
@@ -408,19 +407,19 @@ __global__ void KeBilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; if (data_layout == DataLayout::kNCHW) { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], - h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id], h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w], + h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w + w_id], + h1lambda * w1lambda * out_pos[0]); } else { - platform::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[w_id * num_channels], - h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], - h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos[0], h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[w_id * num_channels], + h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos[h_id * in_img_w * num_channels], + h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos[h_id * in_img_w * num_channels + w_id * num_channels], h1lambda * w1lambda * out_pos[0]); } @@ -638,22 +637,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id], - d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], - d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], - d1lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w + w_id], + d2lambda * h1lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w + w_id], + d1lambda * h1lambda * w1lambda * out_pos[0]); } else { int in_pos1_idx = out_id_h * input_w + in_img_idt * in_img_h * in_img_w * num_channels + @@ -666,22 +665,22 @@ __global__ void KeTrilinearInterpBw(T* in, const T* out_pos = &out[out_id_h * output_w + out_id_w]; // trilinear interpolation grad - platform::CudaAtomicAdd(&in_pos1[0], - d2lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[w_id * num_channels], - 
d2lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], - d2lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos1[0], + d2lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[w_id * num_channels], + d2lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos1[h_id * in_img_w * num_channels], + d2lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos1[h_id * in_img_w * num_channels + w_id * num_channels], d2lambda * h1lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[0], - d1lambda * h2lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[w_id * num_channels], - d1lambda * h2lambda * w1lambda * out_pos[0]); - platform::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], - d1lambda * h1lambda * w2lambda * out_pos[0]); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd(&in_pos2[0], + d1lambda * h2lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[w_id * num_channels], + d1lambda * h2lambda * w1lambda * out_pos[0]); + phi::CudaAtomicAdd(&in_pos2[h_id * in_img_w * num_channels], + d1lambda * h1lambda * w2lambda * out_pos[0]); + phi::CudaAtomicAdd( &in_pos2[h_id * in_img_w * num_channels + w_id * num_channels], d1lambda * h1lambda * w1lambda * out_pos[0]); } @@ -903,8 +902,8 @@ __global__ void KeBicubicInterpBw(T* in, in_pos = &in[out_id_h * input_w + access_y * in_img_w * num_channels + access_x * num_channels + channel_id]; } - platform::CudaAtomicAdd(&in_pos[0], - (out_pos[0] * y_coeffs[j] * x_coeffs[i])); + phi::CudaAtomicAdd(&in_pos[0], + (out_pos[0] * y_coeffs[j] * x_coeffs[i])); } } } diff --git a/paddle/fluid/operators/limit_by_capacity_op.cu b/paddle/fluid/operators/limit_by_capacity_op.cu index 4ca7a03b489be5..f6e0bffa1d1ce9 100644 --- a/paddle/fluid/operators/limit_by_capacity_op.cu +++ b/paddle/fluid/operators/limit_by_capacity_op.cu @@ -22,8 +22,8 @@ #include "paddle/fluid/operators/limit_by_capacity_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -39,7 +39,7 @@ __global__ void limit_by_capacity_impl( wid = i / n_expert; eid = i % n_expert; auto proposal = expc[wid * n_expert + eid]; - auto cap_left = paddle::platform::CudaAtomicAdd(cap + eid, proposal * (-1)); + auto cap_left = phi::CudaAtomicAdd(cap + eid, proposal * (-1)); if (cap_left >= proposal) { out[wid * n_expert + eid] = proposal; } else if (cap_left >= 0) { diff --git a/paddle/fluid/operators/lookup_table_op.cu b/paddle/fluid/operators/lookup_table_op.cu index 073077f6586fae..0562228f516fac 100644 --- a/paddle/fluid/operators/lookup_table_op.cu +++ b/paddle/fluid/operators/lookup_table_op.cu @@ -15,8 +15,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/lookup_table_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -93,7 +93,7 @@ __global__ void LookupTableGrad(T *table, const T *out = output + idy * D; T *tab = table + id * D; for (int i = idx; i < D; i += BlockDimX) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } idy += BlockDimY * GridDimX; } diff --git a/paddle/fluid/operators/lookup_table_v2_op.cu b/paddle/fluid/operators/lookup_table_v2_op.cu index 41be6b34e6e5b6..a3d8c91d862652 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cu +++ b/paddle/fluid/operators/lookup_table_v2_op.cu @@ -15,8 +15,8 @@ limitations under the License. */ #include "paddle/fluid/operators/lookup_table_v2_op.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -65,10 +65,10 @@ __global__ void LookupTableV2Grad(T *table, const T *out = output + idy * D; T *tab = table + id * D; #ifdef PADDLE_WITH_CUDA - paddle::platform::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); + phi::VectorizedAtomicAddPerBlock(D, idx, blockDim.x, out, tab); #else for (int i = idx; i < D; i += blockDim.x) { - paddle::platform::CudaAtomicAdd(&tab[i], out[i]); + phi::CudaAtomicAdd(&tab[i], out[i]); } #endif idy += blockDim.y * gridDim.x; diff --git a/paddle/fluid/operators/math/cos_sim_functor.cu b/paddle/fluid/operators/math/cos_sim_functor.cu index cbe76844519a15..bb04df0879bf6f 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cu +++ b/paddle/fluid/operators/math/cos_sim_functor.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -44,7 +44,7 @@ __global__ void CosSimDyKernel(const T* x_norm, for (size_t i = 0; i < cols; ++i) { T dy_data = dz_data * (x_data[i] * reciprocal_xy_norm_prod - z_data * y[i] * reciprocal_y_norm_square); - platform::CudaAtomicAdd(dy + i, dy_data); + phi::CudaAtomicAdd(dy + i, dy_data); } } } diff --git a/paddle/fluid/operators/math/cross_entropy.cu b/paddle/fluid/operators/math/cross_entropy.cu index 0e5b95542455e3..6947801bf76eab 100644 --- a/paddle/fluid/operators/math/cross_entropy.cu +++ b/paddle/fluid/operators/math/cross_entropy.cu @@ -16,8 +16,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/im2col.cu b/paddle/fluid/operators/math/im2col.cu index 843e50c50a697b..5c7038714e93c6 100644 --- a/paddle/fluid/operators/math/im2col.cu +++ b/paddle/fluid/operators/math/im2col.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -466,8 +466,7 @@ __global__ void col2imOCF(const T* col_data, if (height_offset >= 0 && height_offset < im_height && width_offset >= 0 && width_offset < im_width) { - paddle::platform::CudaAtomicAdd(im_data + im_offset, - col_data[col_offset]); + phi::CudaAtomicAdd(im_data + im_offset, col_data[col_offset]); } } } diff --git a/paddle/fluid/operators/math/maxouting.cu b/paddle/fluid/operators/math/maxouting.cu index df115fd16966db..9f1d2286395a47 100644 --- a/paddle/fluid/operators/math/maxouting.cu +++ b/paddle/fluid/operators/math/maxouting.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/maxouting.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index eadf0d070b9018..530b68bbfbb3c9 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/sequence_pooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { diff --git a/paddle/fluid/operators/math/sequence_scale.cu b/paddle/fluid/operators/math/sequence_scale.cu index a0bb2a1ac33ced..21010ca33148e0 100644 --- a/paddle/fluid/operators/math/sequence_scale.cu +++ b/paddle/fluid/operators/math/sequence_scale.cu @@ -13,14 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/sequence_scale.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { namespace math { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void SequenceScaleKernel(T* seq, diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index e3d7abb6e0d716..0ecac6c5fb07a7 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/unpooling.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/vol2col.cu b/paddle/fluid/operators/math/vol2col.cu index 765f31eba34f01..999e29470ebbd1 100644 --- a/paddle/fluid/operators/math/vol2col.cu +++ b/paddle/fluid/operators/math/vol2col.cu @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/vol2col.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 3e7f8a5363ac0b..e73496a46a0add 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -15,13 +15,13 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/mean_iou_op.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void CountCUDAKernel(const int num_classes, diff --git a/paddle/fluid/operators/number_count_op.cu b/paddle/fluid/operators/number_count_op.cu index dcbf95d059185d..25541ebdb36217 100644 --- a/paddle/fluid/operators/number_count_op.cu +++ b/paddle/fluid/operators/number_count_op.cu @@ -22,8 +22,8 @@ #include "paddle/fluid/operators/number_count_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -77,7 +77,7 @@ __global__ void NumberCount(const T* numbers, #endif } if (threadIdx.x % WARP_SIZE == 0) { - platform::CudaAtomicAdd(number_count + i, x); + phi::CudaAtomicAdd(number_count + i, x); } } } diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 1a2939366f3910..b36ca97b3e40f9 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -14,11 +14,11 @@ #include "paddle/fluid/operators/one_hot_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void FillOutputKernel(const InT* p_in_data, diff --git a/paddle/fluid/operators/optimizers/sgd_op.cu b/paddle/fluid/operators/optimizers/sgd_op.cu index 05b00bac890a71..385e9a70e5489b 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cu +++ b/paddle/fluid/operators/optimizers/sgd_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/optimizers/sgd_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -56,7 +56,7 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, for (int64_t index = threadIdx.x; index < row_numel; index += blockDim.x) { // Since index in rows of SelectedRows can be duplicate, we have to use // Atomic Operation to avoid concurrent write error. - paddle::platform::CudaAtomicAdd( + phi::CudaAtomicAdd( tensor_out_ptr + index, -static_cast(1.0) * learning_rate[0] * selected_rows_ptr[index]); } diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index c76a6b61e780e9..7b0dd2149dead5 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -16,13 +16,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/math_function.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void Pad2DConstNCHW(const int nthreads, @@ -257,9 +257,8 @@ __global__ void Pad2DGradReflectNCHW(const int out_size, in_w = max(in_w, -in_w); in_h = min(in_h, 2 * in_height - in_h - 2); in_w = min(in_w, 2 * in_width - in_w - 2); - platform::CudaAtomicAdd( - &d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); + phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); } } @@ -288,7 +287,7 @@ __global__ void Pad2DGradReflectNHWC(const int out_size, in_w = max(in_w, -in_w); in_h = min(in_h, in_height * 2 - in_h - 2); in_w = min(in_w, in_width * 2 - in_w - 2); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], d_out_data[out_index]); } @@ -313,9 +312,8 @@ __global__ void Pad2DGradEdgeNCHW(const int out_size, nc /= out_height; const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - platform::CudaAtomicAdd( - &d_in_data[(nc * in_height + in_h) * in_width + in_w], - d_out_data[out_index]); + phi::CudaAtomicAdd(&d_in_data[(nc * in_height + in_h) * in_width + in_w], + d_out_data[out_index]); } } @@ -340,7 +338,7 @@ __global__ void Pad2DGradEdgeNHWC(const int out_size, n /= out_height; const int in_h = min(in_height - 1, max(out_h - pad_top, 0)); const int in_w = min(in_width - 1, max(out_w - pad_left, 0)); - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &d_in_data[((n * in_height + in_h) * in_width + in_w) * channels + c], d_out_data[out_index]); } diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index d4375b2fc48ce4..07a2bde7e94e46 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/phi/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #endif namespace paddle { @@ -96,7 +96,7 @@ DEVICE void PrRoIPoolingDistributeDiff(T* diff, const T coeff) { bool overflow = (h < 0) || (w < 0) || (h >= height) || (w >= width); if (!overflow) { - paddle::platform::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); + phi::CudaAtomicAdd(diff + h * width + w, top_diff * coeff); } } #else @@ -166,7 +166,7 @@ HOSTDEVICE void PrRoIPoolingMatDistributeDiff(T* diff, #if defined(__NVCC__) || defined(__HIPCC__) template DEVICE void AccumulateRois(T* offset, T data) { - paddle::platform::CudaAtomicAdd(offset, data); + phi::CudaAtomicAdd(offset, data); } #else template diff --git a/paddle/fluid/operators/prune_gate_by_capacity_op.cu b/paddle/fluid/operators/prune_gate_by_capacity_op.cu index 9f5751fe0bdc7e..f21d640384c4d4 100644 --- a/paddle/fluid/operators/prune_gate_by_capacity_op.cu +++ b/paddle/fluid/operators/prune_gate_by_capacity_op.cu @@ -21,7 +21,7 @@ // Licensed under the Apache License, Version 2.0 (the "License"). 
#include "paddle/fluid/operators/prune_gate_by_capacity_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace ops = paddle::operators; namespace plat = paddle::platform; @@ -47,7 +47,7 @@ __global__ void prune_gate_by_capacity_kernel(const T1* gate_idx_data, const int64_t batch_size) { CUDA_KERNEL_LOOP(i, batch_size) { auto orig_cap = - platform::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1); + phi::CudaAtomicAdd(expert_count_data + gate_idx_data[i], -1); if (orig_cap <= 0) { new_gate_idx_data[i] = -1; } else { diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu index 26a02ea622479f..cfa317a3d392fb 100644 --- a/paddle/fluid/operators/pull_box_extended_sparse_op.cu +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -14,7 +14,7 @@ #include "paddle/fluid/operators/pull_box_extended_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pull_box_sparse_op.kps b/paddle/fluid/operators/pull_box_sparse_op.kps index 6b7c7c8495108e..4b0580c5e1ab5c 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.kps +++ b/paddle/fluid/operators/pull_box_sparse_op.kps @@ -37,7 +37,7 @@ limitations under the License. */ #include "xpu/kernel/math.h" // NOLINT #else #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #endif #include "paddle/fluid/operators/pull_box_sparse_op.h" @@ -46,9 +46,13 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; #ifdef PADDLE_WITH_XPU_KP -REGISTER_OP_KERNEL(pull_box_sparse, KP, plat::XPUPlace, +REGISTER_OP_KERNEL(pull_box_sparse, + KP, + plat::XPUPlace, ops::PullBoxSparseKernel); -REGISTER_OP_KERNEL(push_box_sparse, KP, plat::XPUPlace, +REGISTER_OP_KERNEL(push_box_sparse, + KP, + plat::XPUPlace, ops::PushBoxSparseKernel); #else REGISTER_OP_CUDA_KERNEL(pull_box_sparse, ops::PullBoxSparseKernel); diff --git a/paddle/fluid/operators/pull_gpups_sparse_op.cu b/paddle/fluid/operators/pull_gpups_sparse_op.cu index 996eacf428979d..d22c632d60dd25 100644 --- a/paddle/fluid/operators/pull_gpups_sparse_op.cu +++ b/paddle/fluid/operators/pull_gpups_sparse_op.cu @@ -14,11 +14,11 @@ #include "paddle/fluid/operators/pull_gpups_sparse_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/quantize_linear_op.cu b/paddle/fluid/operators/quantize_linear_op.cu index c5d8b1928fd78e..259c1507af038a 100644 --- a/paddle/fluid/operators/quantize_linear_op.cu +++ b/paddle/fluid/operators/quantize_linear_op.cu @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/fake_dequantize_op.cu.h" #include "paddle/fluid/operators/fake_quantize_op.cu.h" #include "paddle/fluid/operators/quantize_linear_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/random_routing_op.cu b/paddle/fluid/operators/random_routing_op.cu index 287a523f61f3bc..1fdb1bf73a3047 100644 --- a/paddle/fluid/operators/random_routing_op.cu +++ b/paddle/fluid/operators/random_routing_op.cu @@ -14,8 +14,8 @@ #include "paddle/fluid/operators/random_routing_op.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/rank_attention_op.cu b/paddle/fluid/operators/rank_attention_op.cu index 36117e605031eb..8107e520b0492e 100644 --- a/paddle/fluid/operators/rank_attention_op.cu +++ b/paddle/fluid/operators/rank_attention_op.cu @@ -18,7 +18,7 @@ limitations under the License. */ #include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/kernels/funcs/blas/blas.h" namespace paddle { diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu index 6a1afcc18e68c7..0f53f292ef8ae4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cu @@ -16,11 +16,11 @@ #include #include "paddle/fluid/operators/sequence_ops/sequence_enumerate_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu index 74789ecde9d386..d8b0afbc85dc59 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_erase_op.cu @@ -16,11 +16,11 @@ limitations under the License. */ #include #include "paddle/fluid/operators/sequence_ops/sequence_erase_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { -using platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; using LoDTensor = phi::DenseTensor; template diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu index d5beedd35338ae..f565e0d438a0e6 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_as_op.cu @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/sequence_ops/sequence_expand_as_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu index 2c9b6408a7390f..e4ebd47878cb2b 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.cu @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/sequence_ops/sequence_expand_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ __global__ void sequence_expand_grad_kernel(const T* dout_data, for (int tid_y = threadIdx.y; tid_y < x_item_count; tid_y += blockDim.y) { for (int tid_x = threadIdx.x; tid_x < x_item_length; tid_x += blockDim.x) { - platform::CudaAtomicAdd( + phi::CudaAtomicAdd( &dx_data[(x_offset + tid_y) * x_item_length + tid_x], dout_data[(out_offset + tid_z * x_item_count + tid_y) * x_item_length + diff --git a/paddle/fluid/operators/shuffle_channel_op.cu b/paddle/fluid/operators/shuffle_channel_op.cu index 26eee095377c0a..4869a4c6c5e223 100644 --- a/paddle/fluid/operators/shuffle_channel_op.cu +++ b/paddle/fluid/operators/shuffle_channel_op.cu @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/operators/shuffle_channel_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index cad6f416d41d8d..d2583aeb143ec6 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -11,7 +11,7 @@ #include "paddle/fluid/operators/temporal_shift_op.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h index faf2b08089157d..e95bca3c2791ef 100644 --- a/paddle/fluid/operators/top_k_function_cuda.h +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -28,8 +28,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/top_k_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #define FINAL_MASK 0xffffffff #ifdef __HIPCC__ @@ -713,7 +713,7 @@ __device__ void RadixCountUsingMask(const T* input, if (GetLaneId() == 0) { #pragma unroll for (uint32_t i = 0; i < RadixSize; ++i) { - platform::CudaAtomicAdd(&shared_mem[i], counts[i]); + phi::CudaAtomicAdd(&shared_mem[i], counts[i]); } } diff --git a/paddle/fluid/operators/transpose_op.cu.h b/paddle/fluid/operators/transpose_op.cu.h index eb9e8a7bed7845..4fc610c393f103 100644 --- a/paddle/fluid/operators/transpose_op.cu.h +++ b/paddle/fluid/operators/transpose_op.cu.h @@ -16,9 +16,9 @@ limitations under the License. */ #include "paddle/fluid/framework/gpu_utils.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/autotune/auto_tune_base.h" diff --git a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu index 68229bba74ccd8..a3fff0dbed8e2e 100644 --- a/paddle/fluid/platform/device/gpu/cuda_helper_test.cu +++ b/paddle/fluid/platform/device/gpu/cuda_helper_test.cu @@ -24,17 +24,15 @@ #define PADDLE_CUDA_FP16 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/phi/backends/gpu/gpu_primitives.h" using paddle::platform::float16; -using paddle::platform::PADDLE_CUDA_NUM_THREADS; +using phi::PADDLE_CUDA_NUM_THREADS; template __global__ void AddKernel(const T* data_a, T* data_b, size_t num) { - CUDA_KERNEL_LOOP(i, num) { - paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]); - } + CUDA_KERNEL_LOOP(i, num) { phi::CudaAtomicAdd(&data_b[i], data_a[i]); } } template diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h deleted file mode 100644 index 96eddf09237d98..00000000000000 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ /dev/null @@ -1,606 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#ifdef PADDLE_WITH_CUDA -#include -#endif -#ifdef PADDLE_WITH_HIP -#include -#endif -#include - -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace platform { - -#define CUDA_ATOMIC_WRAPPER(op, T) \ - __device__ __forceinline__ T CudaAtomic##op(T *address, const T val) - -#define USE_CUDA_ATOMIC(op, T) \ - CUDA_ATOMIC_WRAPPER(op, T) { return atomic##op(address, val); } - -// Default thread count per block(or block size). -// TODO(typhoonzero): need to benchmark against setting this value -// to 1024. -constexpr int PADDLE_CUDA_NUM_THREADS = 512; - -// For atomicAdd. -USE_CUDA_ATOMIC(Add, float); -USE_CUDA_ATOMIC(Add, int); -USE_CUDA_ATOMIC(Add, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -USE_CUDA_ATOMIC(Add, unsigned long long int); // NOLINT - -CUDA_ATOMIC_WRAPPER(Add, int64_t) { - // Here, we check long long int must be int64_t. - static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - return CudaAtomicAdd( - reinterpret_cast(address), // NOLINT - static_cast(val)); // NOLINT -} - -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600) -USE_CUDA_ATOMIC(Add, double); -#else -CUDA_ATOMIC_WRAPPER(Add, double) { - unsigned long long int *address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - old = atomicCAS(address_as_ull, - assumed, - __double_as_longlong(val + __longlong_as_double(assumed))); - - // Note: uses integer comparison to avoid hang in case of NaN - } while (assumed != old); - - return __longlong_as_double(old); -} -#endif - -#ifdef PADDLE_CUDA_FP16 -// NOTE(dzhwinter): cuda do not have atomicCAS for half. -// Just use the half address as a unsigned value address and -// do the atomicCAS. According to the value store at high 16 bits -// or low 16 bits, then do a different sum and CAS. -// Given most warp-threads will failed on the atomicCAS, so this -// implemented should be avoided in high concurrency. It's will be -// slower than the way convert value into 32bits and do a full atomicCAS. - -// convert the value into float and do the add arithmetic. -// then store the result into a uint32. 
-inline static __device__ uint32_t add_to_low_half(uint32_t val, float x) { - float16 low_half; - // the float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t add_to_high_half(uint32_t val, float x) { - float16 high_half; - // the float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -#if CUDA_VERSION >= 10000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 -static __device__ __forceinline__ float16 CUDAFP16ToPDFP16(__half x) { - return *reinterpret_cast(&x); -} - -static __device__ __forceinline__ __half PDFP16ToCUDAFP16(float16 x) { - return *reinterpret_cast<__half *>(&x); -} - -CUDA_ATOMIC_WRAPPER(Add, float16) { - return CUDAFP16ToPDFP16( - atomicAdd(reinterpret_cast<__half *>(address), PDFP16ToCUDAFP16(val))); -} -#else -CUDA_ATOMIC_WRAPPER(Add, float16) { - // concrete packed float16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, add_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, add_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -// The performance of "atomicAdd(half* )" is bad, but for "atomicAdd(half2* )" -// is good. So for fp16 type, we can use "atomicAdd(half2* )" to speed up. -template ::value>::type * = nullptr> -__device__ __forceinline__ void fastAtomicAdd(T *tensor, - size_t index, - const size_t numel, - T value) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - CudaAtomicAdd(reinterpret_cast(tensor) + index, - static_cast(value)); -#else - // whether the address is 32-byte aligned. - __half *target_addr = reinterpret_cast<__half *>(tensor + index); - bool aligned_half2 = - (reinterpret_cast(target_addr) % sizeof(__half2) == 0); - - if (aligned_half2 && index < (numel - 1)) { - __half2 value2; - value2.x = *reinterpret_cast<__half *>(&value); - value2.y = __int2half_rz(0); - atomicAdd(reinterpret_cast<__half2 *>(target_addr), value2); - - } else if (!aligned_half2 && index > 0) { - __half2 value2; - value2.x = __int2half_rz(0); - value2.y = *reinterpret_cast<__half *>(&value); - atomicAdd(reinterpret_cast<__half2 *>(target_addr - 1), value2); - - } else { - atomicAdd(reinterpret_cast<__half *>(tensor) + index, - *reinterpret_cast<__half *>(&value)); - } -#endif -} - -template ::value>::type * = nullptr> -__device__ __forceinline__ void fastAtomicAdd(T *arr, - size_t index, - const size_t numel, - T value) { - CudaAtomicAdd(arr + index, value); -} -#endif - -// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. 
-inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { - bfloat16 low_half; - // the bfloat16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(static_cast(low_half) + x); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { - bfloat16 high_half; - // the bfloat16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(static_cast(high_half) + x); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 -static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { - return *reinterpret_cast(&x); -} - -static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { - return *reinterpret_cast<__nv_bfloat16 *>(&x); -} - -CUDA_ATOMIC_WRAPPER(Add, bfloat16) { - return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), - PDBF16ToCUDABF16(val))); -} -#else -CUDA_ATOMIC_WRAPPER(Add, bfloat16) { - // concrete packed bfloat16 value may exsits in lower or higher 16bits - // of the 32bits address. - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t sum; - uint32_t newval; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // the bfloat16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_low_half(assumed, val_f)); - } while (old != assumed); - bfloat16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // the bfloat16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS( - address_as_ui, assumed, bf16_add_to_high_half(assumed, val_f)); - } while (old != assumed); - bfloat16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -CUDA_ATOMIC_WRAPPER(Add, complex) { - float *real = reinterpret_cast(address); - float *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -CUDA_ATOMIC_WRAPPER(Add, complex) { - double *real = reinterpret_cast(address); - double *imag = real + 1; - return complex(CudaAtomicAdd(real, val.real), - CudaAtomicAdd(imag, val.imag)); -} - -// For atomicMax -USE_CUDA_ATOMIC(Max, int); -USE_CUDA_ATOMIC(Max, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) -USE_CUDA_ATOMIC(Max, unsigned long long int); // NOLINT -#else -CUDA_ATOMIC_WRAPPER(Max, unsigned long long int) { // NOLINT - if (*address >= val) { - return *address; - } - - unsigned long long int old = *address, assumed; // NOLINT - - do { - assumed = old; - if (assumed >= val) { - break; - } - - old = atomicCAS(address, assumed, val); - } while (assumed != old); -} -#endif - -CUDA_ATOMIC_WRAPPER(Max, int64_t) { - // Here, we check long long int must be int64_t. 
- static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - long long int res = *address; // NOLINT - while (val > res) { - long long int old = res; // NOLINT - res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT - (unsigned long long int)old, // NOLINT - (unsigned long long int)val); // NOLINT - if (res == old) { - break; - } - } - return res; -} - -CUDA_ATOMIC_WRAPPER(Max, float) { - if (*address >= val) { - return *address; - } - - int *const address_as_i = reinterpret_cast(address); - int old = *address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) >= val) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(val)); - } while (assumed != old); - - return __int_as_float(old); -} - -CUDA_ATOMIC_WRAPPER(Max, double) { - if (*address >= val) { - return *address; - } - - unsigned long long int *const address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - if (__longlong_as_double(assumed) >= val) { - break; - } - - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); - } while (assumed != old); - - return __longlong_as_double(old); -} - -#ifdef PADDLE_CUDA_FP16 -inline static __device__ uint32_t max_to_low_half(uint32_t val, float x) { - float16 low_half; - // The float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(max(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t max_to_high_half(uint32_t val, float x) { - float16 high_half; - // The float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(max(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Max, float16) { - if (*address >= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, max_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, max_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -// For atomicMin -USE_CUDA_ATOMIC(Min, int); -USE_CUDA_ATOMIC(Min, unsigned int); -// CUDA API uses unsigned long long int, we cannot use uint64_t here. -// It because unsigned long long int is not necessarily uint64_t -#if defined(__HIPCC__) || (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 350) -USE_CUDA_ATOMIC(Min, unsigned long long int); // NOLINT -#else -CUDA_ATOMIC_WRAPPER(Min, unsigned long long int) { // NOLINT - if (*address <= val) { - return *address; - } - - unsigned long long int old = *address, assumed; // NOLINT - - do { - assumed = old; - if (assumed <= val) { - break; - } - - old = atomicCAS(address, assumed, val); - } while (assumed != old); -} -#endif - -CUDA_ATOMIC_WRAPPER(Min, int64_t) { - // Here, we check long long int must be int64_t. 
- static_assert(sizeof(int64_t) == sizeof(long long int), // NOLINT - "long long should be int64"); - long long int res = *address; // NOLINT - while (val < res) { - long long int old = res; // NOLINT - res = (long long int)atomicCAS((unsigned long long int *)address, // NOLINT - (unsigned long long int)old, // NOLINT - (unsigned long long int)val); // NOLINT - if (res == old) { - break; - } - } - return res; -} - -CUDA_ATOMIC_WRAPPER(Min, float) { - if (*address <= val) { - return *address; - } - - int *const address_as_i = reinterpret_cast(address); - int old = *address_as_i, assumed; - - do { - assumed = old; - if (__int_as_float(assumed) <= val) { - break; - } - - old = atomicCAS(address_as_i, assumed, __float_as_int(val)); - } while (assumed != old); - - return __int_as_float(old); -} - -CUDA_ATOMIC_WRAPPER(Min, double) { - if (*address <= val) { - return *address; - } - - unsigned long long int *const address_as_ull = // NOLINT - reinterpret_cast(address); // NOLINT - unsigned long long int old = *address_as_ull, assumed; // NOLINT - - do { - assumed = old; - if (__longlong_as_double(assumed) <= val) { - break; - } - - old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val)); - } while (assumed != old); - - return __longlong_as_double(old); -} - -#ifdef PADDLE_CUDA_FP16 -inline static __device__ uint32_t min_to_low_half(uint32_t val, float x) { - float16 low_half; - // The float16 in lower 16bits - low_half.x = static_cast(val & 0xFFFFu); - low_half = static_cast(min(static_cast(low_half), x)); - return (val & 0xFFFF0000u) | low_half.x; -} - -inline static __device__ uint32_t min_to_high_half(uint32_t val, float x) { - float16 high_half; - // The float16 in higher 16bits - high_half.x = static_cast(val >> 16); - high_half = static_cast(min(static_cast(high_half), x)); - return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); -} - -CUDA_ATOMIC_WRAPPER(Min, float16) { - if (*address <= val) { - return *address; - } - uint32_t *address_as_ui = reinterpret_cast( - reinterpret_cast(address) - - (reinterpret_cast(address) & 0x02)); - float val_f = static_cast(val); - uint32_t old = *address_as_ui; - uint32_t assumed; - if (((uintptr_t)address & 0x02) == 0) { - // The float16 value stay at lower 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, min_to_low_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old & 0xFFFFu; - return ret; - } else { - // The float16 value stay at higher 16 bits of the address. - do { - assumed = old; - old = atomicCAS(address_as_ui, assumed, min_to_high_half(assumed, val_f)); - } while (old != assumed); - float16 ret; - ret.x = old >> 16; - return ret; - } -} -#endif - -#ifdef PADDLE_CUDA_FP16 -#ifdef PADDLE_WITH_CUDA -/* - * One thead block deals with elementwise atomicAdd for vector of len. - * @in: [x1, x2, x3, ...] - * @out:[y1+x1, y2+x2, y3+x3, ...] - * */ -template ::value>::type * = nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -} - -// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. 
-template ::value>::type * = nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -#else - int i = 0; - int loops = len / 2 * 2; - - bool aligned_half2 = - (reinterpret_cast(out) % sizeof(__half2) == 0); - - if (aligned_half2) { - for (i = tid * 2; i < loops; i += threads_per_block * 2) { - __half2 value2; - T value_1 = in[i]; - T value_2 = in[i + 1]; - value2.x = *reinterpret_cast<__half *>(&value_1); - value2.y = *reinterpret_cast<__half *>(&value_2); - atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); - } - for (; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } else { - for (int i = tid; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } -#endif -} -#endif -#endif -} // namespace platform -} // namespace paddle diff --git a/paddle/phi/backends/gpu/gpu_primitives.h b/paddle/phi/backends/gpu/gpu_primitives.h index be08f29aa81502..2924b1b6bb3df2 100644 --- a/paddle/phi/backends/gpu/gpu_primitives.h +++ b/paddle/phi/backends/gpu/gpu_primitives.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. From fa0d9f919f6b01313532568962337a678e8596f1 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Wed, 16 Nov 2022 22:40:43 +0800 Subject: [PATCH 2/3] fix PR-CI-GpuPS fail --- paddle/fluid/framework/fleet/heter_ps/feature_value.cu | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu index a273c4a5e8dda3..80a827e6ad0e89 100644 --- a/paddle/fluid/framework/fleet/heter_ps/feature_value.cu +++ b/paddle/fluid/framework/fleet/heter_ps/feature_value.cu @@ -79,9 +79,8 @@ __global__ void PullDedupCopy(const size_t N, return; } - float* src_ptr = - (float*)((char*)src + uint64_t(restore_idx[i]) *. // NOLINT - uint64_t(max_val_size)); + float* src_ptr = (float*)((char*)src + uint64_t(restore_idx[i]) * // NOLINT + uint64_t(max_val_size)); switch (off) { case 0: *(dest_ptr + off) = src_ptr[accessor.ShowIndex()]; From 2f140abdc1e742eb7422b5cc76c133e5deaff6c5 Mon Sep 17 00:00:00 2001 From: Wang Xin Date: Thu, 17 Nov 2022 13:25:14 +0800 Subject: [PATCH 3/3] fix PR-CI-GpuPS fail --- paddle/fluid/operators/dequantize_log_op.cu | 2 +- paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 423363d49989ac..18719c76b2ef08 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/

 #include "paddle/fluid/operators/dequantize_log_op.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
+#include "paddle/phi/kernels/funcs/math.h"

 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
index e8c14269878006..56d28c20dc8e76 100644
--- a/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
+++ b/paddle/fluid/operators/detection/sigmoid_focal_loss_op.cu
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/detection/sigmoid_focal_loss_op.h"
-#include "paddle/fluid/operators/math.h"
 #include "paddle/phi/backends/gpu/gpu_primitives.h"
 #include "paddle/phi/core/hostdevice.h"
 #include "paddle/phi/kernels/funcs/math.h"
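
Note (illustrative sketch, not part of the patch series): after this change, GPU call sites include "paddle/phi/backends/gpu/gpu_primitives.h" and reach the helpers through the phi namespace, as the hunks above do. A minimal standalone kernel in that style, mirroring the AddKernel in cuda_helper_test.cu; the names AccumulateKernel, src, dst, and n are hypothetical:

#include "paddle/phi/backends/gpu/gpu_primitives.h"

// Accumulate src into dst element-wise; the atomic add tolerates overlapping writers.
__global__ void AccumulateKernel(const float* src, float* dst, size_t n) {
  // Plain grid-stride loop, the same pattern CUDA_KERNEL_LOOP expands to.
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
       i += gridDim.x * blockDim.x) {
    phi::CudaAtomicAdd(&dst[i], src[i]);  // was paddle::platform::CudaAtomicAdd
  }
}

// Launch with the default block size exposed by the phi header:
//   int threads = phi::PADDLE_CUDA_NUM_THREADS;
//   AccumulateKernel<<<(n + threads - 1) / threads, threads>>>(src, dst, n);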