From e6038653575ab0daf637e8fbfb615c86a8bc3e90 Mon Sep 17 00:00:00 2001
From: yuguo <948529990@qq.com>
Date: Fri, 8 Dec 2023 14:29:11 +0800
Subject: [PATCH] fix rocm compile bugs (#59775)

* fix rocm compile bugs

* pre-commit fix

* fix rocm compile bugs
---
 .../new_executor/program_interpreter.cc       |   8 +-
 .../new_executor/program_interpreter.h        |   2 +-
 paddle/phi/core/distributed/nccl_comm_task.cc |  17 ++-
 paddle/phi/core/visit_type.h                  |   2 +-
 paddle/phi/kernels/autotune/gpu_timer.h       |  12 ++
 paddle/phi/kernels/funcs/dropout_impl.cu.h    |  15 ++-
 paddle/phi/kernels/funcs/fc_functor.cu        |   2 +
 ...dropout_residual_layer_norm_grad_kernel.cu |  11 +-
 ...bias_dropout_residual_layer_norm_kernel.cu |   7 ++
 .../fusion/gpu/fused_conv2d_add_act.cu        |   2 +
 .../gpu/fused_dropout_add_grad_kernel.cu      |  13 +++
 .../fusion/gpu/fused_dropout_add_kernel.cu    |  13 +++
 .../fusion/gpu/skip_layernorm_kernel.cu       |   7 +-
 paddle/phi/kernels/gpu/strided_copy_kernel.cu |  27 +++++
 .../phi/kernels/gpu/top_p_sampling_kernel.cu  | 109 ++++++++++++++++--
 .../kernels/impl/quant_linear_kernel_impl.h   |   5 +
 test/cpp/auto_parallel/CMakeLists.txt         |   7 +-
 .../pir/control_flow_dialect/CMakeLists.txt   |  42 +++----
 18 files changed, 252 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index 8e30a631a6da8..d1ce9f55e4690 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,
 
   PrepareForCUDAGraphCapture();
 
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   calculate_stream_timer_ = std::make_unique<phi::CalculateStreamTimer>(place);
 #endif
 }
@@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() {
 
 std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
   double start_time = 0, end_time = 0;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   start_time = calculate_stream_timer_->StartTime();
   end_time = calculate_stream_timer_->EndTime();
 #endif
@@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
 
   try {
     instr_node.WaitEvent(place_);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (enable_job_schedule_profiler_) {
       if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" &&
           !interpreter::IsCommunicationOp(instr_node)) {
@@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
     }
 
     instr_node.RecordEvent(place_);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (enable_job_schedule_profiler_) {
       if (instr_node.Id() == last_calculate_instr_id_ &&
           calculate_stream_timer_->IsStarted()) {
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
index 2bc99114cc598..b19e3a06a4258 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.h
+++ b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -16,7 +16,7 @@
 
 #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h"
 
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/kernels/autotune/gpu_timer.h"
 #endif
 
diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc
index 6bc002627a023..4e2efea0068eb 100644
--- a/paddle/phi/core/distributed/nccl_comm_task.cc
+++ b/paddle/phi/core/distributed/nccl_comm_task.cc
@@ -90,6 +90,7 @@ void NCCLCommTask::EndRecord() {
 #endif
 }
 
+#ifdef PADDLE_WITH_CUDA
 void NCCLCommTask::ClearRecord() {
   if (start_event_created_) {
     backends::gpu::GPUDeviceGuard guard(place_.device);
@@ -102,8 +103,22 @@ void NCCLCommTask::ClearRecord() {
     end_event_created_ = false;
   }
 }
+#else  // PADDLE_WITH_HIP
+void NCCLCommTask::ClearRecord() {
+  if (start_event_created_) {
+    backends::gpu::GPUDeviceGuard guard(place_.device);
+    HIP_CHECK(hipEventDestroy(nccl_start_event_));
+    start_event_created_ = false;
+  }
+  if (end_event_created_) {
+    backends::gpu::GPUDeviceGuard guard(place_.device);
+    HIP_CHECK(hipEventDestroy(nccl_end_event_));
+    end_event_created_ = false;
+  }
+}
+#endif
 
-bool NCCLCommTask::CudaEventQuery(cudaEvent_t event) {
+bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) {
 #ifdef PADDLE_WITH_CUDA
   cudaError_t ret = cudaEventQuery(event);
   if (ret == cudaSuccess) {
diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h
index 28f575295b47e..069e737d95607 100644
--- a/paddle/phi/core/visit_type.h
+++ b/paddle/phi/core/visit_type.h
@@ -150,7 +150,7 @@ namespace phi {
 
 ///////// BOOL and Floating and Integral Dispatch Marco ///////////
 
-#if NCCL_VERSION_CODE >= 21000
+#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL)
 #define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \
   [&] {                                                                    \
     const auto& __dtype__ = TYPE;                                          \
diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h
index 1882c21b9cd72..b04c46351c2cf 100644
--- a/paddle/phi/kernels/autotune/gpu_timer.h
+++ b/paddle/phi/kernels/autotune/gpu_timer.h
@@ -32,6 +32,17 @@
 
 namespace phi {
 
+#ifdef PADDLE_WITH_HIP
+static void RecordEventTimerCallback(hipStream_t stream,
+                                     hipError_t status,
+                                     void *user_data) {
+  struct timeval time_now {};
+  gettimeofday(&time_now, nullptr);
+  double *cpu_time = static_cast<double *>(user_data);
+  *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
+  VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
+}
+#else
 static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
                                                cudaError_t status,
                                                void *user_data) {
@@ -41,6 +52,7 @@ static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
   *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
   VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
 }
+#endif
 
 class GpuTimer {
  public:
diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h
index 1fc2f038aae64..0a89a3ca825ba 100644
--- a/paddle/phi/kernels/funcs/dropout_impl.cu.h
+++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -349,7 +349,19 @@ void DropoutFwGPUKernelDriver(
     } else {
       bool copy_in_kernel = GetSeedDataAndIncrement(
           dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);
-
+#ifdef PADDLE_WITH_HIP
+      VectorizedRandomGenerator
+          <<>>(0,
+               size,
+               seed_data,
+               dropout_prob,
+               x_data,
+               mask_data,
+               y_data,
+               upscale_in_train,
+               increment,
+               main_offset);
+#else
       void* functionPtr =
           reinterpret_cast(&(VectorizedRandomGenerator));
       cudaFunction_t cudaFunc;
@@ -401,6 +413,7 @@ void DropoutFwGPUKernelDriver(
 
       VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
                << ", increment = " << increment;
+#endif
     }
   } else {
     if (upscale_in_train) {
diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
index 84a8cc309516b..107759313069d 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cu
+++ b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -373,6 +373,7 @@ template class FCFunctor;
 template class FCFunctor;
 template class FCFunctor;
 
+#ifndef PADDLE_WITH_HIP
 template
 void FCInt8Functor::operator()(
     const DeviceContext& context,
@@ -454,5 +455,6 @@ void FCInt8Functor::operator()(
 template class FCInt8Functor;
 template class FCInt8Functor;
 template class FCInt8Functor;
+#endif
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
index de277f3d21b2c..0f93e21553a74 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
@@ -11,17 +11,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
+#ifndef PADDLE_WITH_HIP
 #include
-
 #include
+#endif
 
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
+#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#endif
 
 namespace phi {
 namespace fusion {
@@ -49,6 +51,7 @@ void FusedBiasDropoutResidualLnGradKernel(
     DenseTensor* bias_grad,
     DenseTensor* ln_scale_grad,
     DenseTensor* ln_bias_grad) {
+#ifndef PADDLE_WITH_HIP
   using U = LayerNormParamType<T>;
   auto* d_y_data = y_grad.data<T>();
   auto* ln_scale_data =
@@ -111,6 +114,10 @@ void FusedBiasDropoutResidualLnGradKernel(
       d_x_data,
       d_bias_data,
       d_residual_data);
+#else
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "FusedBiasDropoutResidualLnGradKernel is not supported on ROCm"));
+#endif
 }
 
 }  // namespace fusion
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
index 78c87a6794096..fd1f754cc9827 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
@@ -17,7 +17,9 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
+#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#endif
 
 namespace phi {
 namespace fusion {
@@ -40,6 +42,7 @@ void FusedBiasDropoutResidualLnKernel(
     DenseTensor* dropout_mask_out,
     DenseTensor* ln_mean,
     DenseTensor* ln_variance) {
+#ifndef PADDLE_WITH_HIP
   using U = phi::funcs::LayerNormParamType<T>;
   auto* x_data = x.data<T>();
   auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>();
@@ -92,6 +95,10 @@ void FusedBiasDropoutResidualLnKernel(
       y_data,
       ln_mean_data,
      ln_var_data);
+#else
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "FusedBiasDropoutResidualLnKernel is not supported on ROCm"));
+#endif
 }
 }  // namespace fusion
 }  // namespace phi
diff --git a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
index 6192b64768088..ac716fa174186 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef PADDLE_WITH_CUDA
 #include
 #include
 
@@ -649,3 +650,4 @@ PD_REGISTER_KERNEL(fused_conv2d_add_act,  // cuda_only
                    float,
                    double,
                    phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
index 03ae48152a9e4..a3eb1483b4f74 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
@@ -202,6 +202,18 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
                        ? NoMaskBwFunctor(1.0f - dropout_rate)
                        : NoMaskBwFunctor(1.0f - dropout_rate, 1.0f);
 
+#ifdef PADDLE_WITH_HIP
+    VectorizedDropoutBackward>
+        <<>>(0,
+             numel,
+             seed_data,  // idx: 2 need save
+             x_grad_data,
+             y_grad_data,
+             out_grad_data,
+             increment,  // idx: 6 need save
+             main_offset,
+             functor);
+#else
     // we assume seed/offset is same across iterations
     // seed_offset_data should preserved by cudaGraph pool
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
@@ -239,6 +251,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
+#endif
   }
 }
 
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
index 5f21b7b05c3d9..17d9698df4d13 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -186,6 +186,18 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     auto dst_functor =
         NoMaskFwFunctor(1.0f - dropout_rate, upscale_in_train);
 
+#ifdef PADDLE_WITH_HIP
+    VectorizedDropoutForward>
+        <<>>(0,
+             numel,
+             seed_data,  // need save
+             x_data,
+             y_data,
+             out_data,
+             increment,  // need save
+             main_offset,
+             dst_functor);
+#else
     void* functionPtr = reinterpret_cast(
         &(VectorizedDropoutForward>));
     cudaFunction_t cudaFunc;
@@ -241,6 +253,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
+#endif
   } else {
     using MT = typename phi::dtype::MPTypeTrait<T>::Type;
     MT factor = static_cast<MT>(1.0f - dropout_rate);
diff --git a/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
index 1bb5c5dfb6301..4671534937a66 100644
--- a/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
@@ -85,9 +85,6 @@ PD_REGISTER_KERNEL(skip_layernorm,
                    float,
                    phi::dtype::float16) {}
 #else
-PD_REGISTER_KERNEL(skip_layernorm,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::fusion::SkipLayerNormKernel,
-                   float){} {}
+PD_REGISTER_KERNEL(
+    skip_layernorm, GPU, ALL_LAYOUT, phi::fusion::SkipLayerNormKernel, float) {}
 #endif
diff --git a/paddle/phi/kernels/gpu/strided_copy_kernel.cu b/paddle/phi/kernels/gpu/strided_copy_kernel.cu
index ae173b5f03528..64cd37cd14b63 100644
--- a/paddle/phi/kernels/gpu/strided_copy_kernel.cu
+++ b/paddle/phi/kernels/gpu/strided_copy_kernel.cu
@@ -32,12 +32,21 @@ __global__ void StridedCopyCaseZeroFunc(
     phi::Array output_stride) {
   int64_t input_offset = 0;
   int64_t output_offset = 0;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                          blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -458,12 +467,21 @@ __global__ void Strided2ContiguousCaseZeroFunc(
                          blockDim.z * blockDim.y * blockDim.x +
                      threadIdx.z * blockDim.y * blockDim.x +
                      threadIdx.y * blockDim.x + threadIdx.x;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                         blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -863,12 +881,21 @@ __global__ void Contiguous2StridedCaseZeroFunc(
                      threadIdx.z * blockDim.y * blockDim.x +
                      threadIdx.y * blockDim.x + threadIdx.x;
   int64_t output_offset = 0;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                          blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
index a78040eb6a669..91aa8e63246b3 100644
--- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
+++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
@@ -14,10 +14,18 @@
 
 #include "paddle/phi/kernels/top_p_sampling_kernel.h"
 
+#ifdef PADDLE_WITH_HIP
+#include
+#include
+#include
+#include
+namespace cub = hipcub;
+#else
 #include
 #include
+#include
+#endif
 
-#include "cub/cub.cuh"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -49,6 +57,16 @@ struct DataTypeTraits {
     __VA_ARGS__;                             \
   } break
 
+#ifdef PADDLE_WITH_HIP
+#define WARP_SIZE 64
+#define FIXED_BLOCK_DIM(...)                 \
+  FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);
+#else
+#define WARP_SIZE 32
 #define FIXED_BLOCK_DIM(...)                 \
   FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \
   FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__);  \
@@ -56,6 +74,7 @@ struct DataTypeTraits {
   FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__);  \
   FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);   \
   FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
+#endif
 
 struct SegmentOffsetIter {
   explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
@@ -107,6 +126,16 @@ struct Pair {
 
 inline int div_up(int a, int n) { return (a + n - 1) / n; }
 
+#ifdef PADDLE_WITH_HIP
+__global__ void setup_kernel(hiprandState_t* state,
+                             const uint64_t seed,
+                             const int bs) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = idx; i < bs; i += gridDim.x * blockDim.x) {
+    hiprand_init(seed, i, 0, &state[i]);
+  }
+}
+#else
 __global__ void setup_kernel(curandState_t* state,
                              const uint64_t seed,
                              const int bs) {
@@ -115,6 +144,7 @@ __global__ void setup_kernel(curandState_t* state,
     curand_init(seed, i, 0, &state[i]);
   }
 }
+#endif
 
 template
 __device__ __forceinline__ void AddTo(Pair topk[],
@@ -199,7 +229,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[],
 template
 __forceinline__ __device__ Pair WarpReduce(Pair input) {
 #pragma unroll
-  for (int offset = 16; offset > 0; offset >>= 1) {
+  for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
     T tmp_val =
         phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
     int tmp_id =
@@ -231,7 +261,7 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[],
     shared_max[wid] = input_now;
   }
   __syncthreads();
-  input_now = (tid < BlockSize / 32)
+  input_now = (tid < BlockSize / WARP_SIZE)
                   ? shared_max[lane]
                   : Pair(std::numeric_limits::min(), -1);
   if (wid == 0) {
@@ -259,12 +289,22 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[],
       if (MaxLength < 5) {
         if (*beam >= MaxLength) break;
       } else {
+#ifdef PADDLE_WITH_HIP
+        uint64_t mask = 0;
+        mask = __ballot(true);
+        if (tid_max / WARP_SIZE == wid) {
+          if (__shfl_down(*beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength)
+            break;
+        }
+#else
         unsigned mask = 0u;
         mask = __ballot_sync(FINAL_MASK, true);
-        if (tid_max / 32 == wid) {
-          if (__shfl_down_sync(FINAL_MASK, *beam, tid_max % 32, 32) == MaxLength)
+        if (tid_max / WARP_SIZE == wid) {
+          if (__shfl_down_sync(
+                  FINAL_MASK, *beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength)
             break;
         }
+#endif
       }
     }
   }
@@ -276,12 +316,16 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
                                      int64_t* out_id,  // topk id
                                      T* out_val,       // topk val
                                      int vocab_size,
+#ifdef PADDLE_WITH_HIP
+                                     hiprandState_t* state,
+#else
                                      curandState_t* state,
+#endif
                                      int* count_iter,
                                      int* count_iter_begin) {
   const int tid = threadIdx.x;
-  const int wid = tid / 32;
-  const int lane = tid % 32;
+  const int wid = tid / WARP_SIZE;
+  const int lane = tid % WARP_SIZE;
   const int bid = blockIdx.x;
   const float threshold_now =
       threshold ? static_cast<float>(threshold[bid]) : 0.f;
@@ -289,7 +333,7 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
   int top_num = TopPBeamTopK;
   float top_p_num = static_cast<float>(top_ps[bid]);
 
-  __shared__ Pair shared_max[BlockSize / 32];
+  __shared__ Pair shared_max[BlockSize / WARP_SIZE];
   __shared__ Pair beam_max[TopPBeamTopK];
 
   Pair topk[MaxLength];
@@ -322,7 +366,11 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
   }
   if (tid == 0) {
     count_iter_begin[bid] = count_iter[bid];
+#ifdef PADDLE_WITH_HIP
+    float rand_top_p = hiprand_uniform(state + bid) * top_p_num;
+#else
     float rand_top_p = curand_uniform(state + bid) * top_p_num;
+#endif
    top_ps[bid] = (T)rand_top_p;
 
     float sum_prob = 0.0f;
@@ -421,9 +469,9 @@ __global__ void topp_sampling(T* sorted_probs,
   __shared__ float rand_p;
   const int tid = threadIdx.x;
   const int bid = blockIdx.x;
-  constexpr int NUM_WARPS = BLOCK_SIZE / 32;
-  const int lane_id = tid % 32;
-  const int warp_id = tid / 32;
+  constexpr int NUM_WARPS = BLOCK_SIZE / WARP_SIZE;
+  const int lane_id = tid % WARP_SIZE;
+  const int warp_id = tid / WARP_SIZE;
   const float p_t = static_cast<float>(top_ps[bid]);
   const float threshold_now =
       threshold ? static_cast<float>(threshold[bid]) : 0.f;
@@ -444,7 +492,11 @@ __global__ void topp_sampling(T* sorted_probs,
   typedef cub::BlockReduce BlockReduce;
   __shared__ typename BlockScan::TempStorage temp_storage;
   __shared__ typename BlockReduce::TempStorage temp_storage_reduce;
+#ifdef PADDLE_WITH_HIP
+  __shared__ uint64_t selected_shared[NUM_WARPS];
+#else
   __shared__ uint32_t selected_shared[NUM_WARPS];
+#endif
   int threshold_id = 0;
 
   // Initialize running total
@@ -468,7 +520,11 @@ __global__ void topp_sampling(T* sorted_probs,
     BlockScan(temp_storage)
         .InclusiveSum(thread_count, thread_offset, prefix_op);
 
+#ifdef PADDLE_WITH_HIP
+    uint64_t activate_mask = __ballot(rand_p <= thread_offset);
+#else
     uint32_t activate_mask = __ballot_sync(FINAL_MASK, rand_p <= thread_offset);
+#endif
 
     i_activate = i;
     if (activate_mask != 0) {
@@ -506,9 +562,15 @@ __global__ void topp_sampling(T* sorted_probs,
     // don't sample low score token
     int max_id =
         BlockReduce(temp_storage_reduce).Reduce(threshold_id, MaxOp());
+#ifdef PADDLE_WITH_HIP
+    hiprandStatePhilox4_32_10_t rng;
+    hiprand_init(seed, tid, 0, &rng);
+    int random_id = hiprand(&rng) % (max_id + 1);
+#else
     curandStatePhilox4_32_10_t rng;
     curand_init(seed, tid, 0, &rng);
     int random_id = curand(&rng) % (max_id + 1);
+#endif
     out_id[bid] = sorted_id[offset + random_id];
     out_val[bid] = sorted_probs[offset + random_id];
   } else {
@@ -537,6 +599,19 @@ __global__ void set_sorted_num(int* need_sorted_num, int bs) {
   *need_sorted_num = bs;
 }
 
+#ifdef PADDLE_WITH_HIP
+template
+__global__ void print_kernel(T* input, int size) {
+  for (int i = 0; i < size; i++) {
+    printf("[");
+    if (i != size - 1) {
+      printf("%f, ", static_cast<float>(input[i]));
+    } else {
+      printf("%f]\n", static_cast<float>(input[i]));
+    }
+  }
+}
+#else
 template
 __global__ void print_kernel(T* input, int size) {
   for (int i = 0; i < size; i++) {
@@ -550,6 +625,7 @@ __global__ void print_kernel(T* input, int size) {
     VLOG(0) << ss.str();
   }
 }
+#endif
 
 template
 T* SafeGetTensorPtr(const DenseTensor& t) {
@@ -612,6 +688,16 @@ void TopPSamplingKernel(const Context& dev_ctx,
     PD_THROW("the input data shape has error in the FillIndex kernel.");
   }
 
+#ifdef PADDLE_WITH_HIP
+  hiprandState_t* dev_curand_states;
+  phi::Allocator::AllocationPtr curand_states_buf{nullptr};
+  curand_states_buf = phi::memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      bs * sizeof(hiprandState_t),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  dev_curand_states =
+      reinterpret_cast<hiprandState_t*>(curand_states_buf->ptr());
+#else
   curandState_t* dev_curand_states;
   phi::Allocator::AllocationPtr curand_states_buf{nullptr};
   curand_states_buf = phi::memory_utils::Alloc(
@@ -620,6 +706,7 @@ void TopPSamplingKernel(const Context& dev_ctx,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   dev_curand_states =
      reinterpret_cast<curandState_t*>(curand_states_buf->ptr());
+#endif
   uint64_t seed;
   if (random_seed == -1) {
     seed = static_cast<uint64_t>(time(NULL) % 1000000);
diff --git a/paddle/phi/kernels/impl/quant_linear_kernel_impl.h b/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
index f48e871dce165..1948c928733e6 100644
--- a/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
+++ b/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
@@ -76,6 +76,10 @@ void QuantLinearKernel(const Context& dev_ctx,
             "The weight's datatype is expected to be int8 when use quant. But "
             "received weight's datatype is %d",
             static_cast<int>(w.dtype())));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_THROW(
+        phi::errors::Unimplemented("FCInt8Functor is not supported on ROCm"));
+#else
     phi::funcs::FCInt8Functor fc;
     fc(dev_ctx,
        M,
@@ -92,6 +96,7 @@ void QuantLinearKernel(const Context& dev_ctx,
        bias_data,
        with_relu,
        padding_weights);
+#endif
     return;
   }
 
diff --git a/test/cpp/auto_parallel/CMakeLists.txt b/test/cpp/auto_parallel/CMakeLists.txt
index 7c2a329172107..c736ad85007b6 100644
--- a/test/cpp/auto_parallel/CMakeLists.txt
+++ b/test/cpp/auto_parallel/CMakeLists.txt
@@ -35,6 +35,7 @@ if(WITH_DISTRIBUTE)
 endif()
-
-cc_test(dist_mapper_test SRCS dist_mapper_test.cc DEPS phi)
-
+cc_test(
+  dist_mapper_test
+  SRCS dist_mapper_test.cc
+  DEPS phi)
diff --git a/test/cpp/pir/control_flow_dialect/CMakeLists.txt b/test/cpp/pir/control_flow_dialect/CMakeLists.txt
index 9309a8a774b29..fe751953c8fc4 100644
--- a/test/cpp/pir/control_flow_dialect/CMakeLists.txt
+++ b/test/cpp/pir/control_flow_dialect/CMakeLists.txt
@@ -1,21 +1,23 @@
-cc_test_old(
-  if_op_test
-  SRCS
-  if_op_test.cc
-  DEPS
-  pir
-  op_dialect_vjp
-  pir_transforms
-  op_dialect
-  gtest)
+if(NOT WITH_ROCM)
+  cc_test_old(
+    if_op_test
+    SRCS
+    if_op_test.cc
+    DEPS
+    pir
+    op_dialect_vjp
+    pir_transforms
+    op_dialect
+    gtest)
 
-cc_test_old(
-  while_op_test
-  SRCS
-  while_op_test.cc
-  DEPS
-  pir
-  op_dialect_vjp
-  pir_transforms
-  op_dialect
-  gtest)
+  cc_test_old(
+    while_op_test
+    SRCS
+    while_op_test.cc
+    DEPS
+    pir
+    op_dialect_vjp
+    pir_transforms
+    op_dialect
+    gtest)
+endif()