PaddlePaddle · YanhuiDua · Dec 8, 2023 · Dec 6, 2023 · Dec 6, 2023 · Dec 6, 2023
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,
 
   PrepareForCUDAGraphCapture();
 
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   calculate_stream_timer_ = std::make_unique<phi::CalculateStreamTimer>(place);
 #endif
 }
@@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() {
 
 std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
   double start_time = 0, end_time = 0;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   start_time = calculate_stream_timer_->StartTime();
   end_time = calculate_stream_timer_->EndTime();
 #endif
@@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
 
   try {
     instr_node.WaitEvent(place_);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (enable_job_schedule_profiler_) {
       if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" &&
           !interpreter::IsCommunicationOp(instr_node)) {
@@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
     }
 
     instr_node.RecordEvent(place_);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (enable_job_schedule_profiler_) {
       if (instr_node.Id() == last_calculate_instr_id_ &&
           calculate_stream_timer_->IsStarted()) {

diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -16,7 +16,7 @@
 
 #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h"
 
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/kernels/autotune/gpu_timer.h"
 #endif
 

diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc
@@ -90,6 +90,7 @@ void NCCLCommTask::EndRecord() {
 #endif
 }
 
+#ifdef PADDLE_WITH_CUDA
 void NCCLCommTask::ClearRecord() {
   if (start_event_created_) {
     backends::gpu::GPUDeviceGuard guard(place_.device);
@@ -102,8 +103,22 @@ void NCCLCommTask::ClearRecord() {
     end_event_created_ = false;
   }
 }
+#else  // PADDLE_WITH_HIP
+void NCCLCommTask::ClearRecord() {
+  if (start_event_created_) {
+    backends::gpu::GPUDeviceGuard guard(place_.device);
+    HIP_CHECK(hipEventDestroy(nccl_start_event_));
+    start_event_created_ = false;
+  }
+  if (end_event_created_) {
+    backends::gpu::GPUDeviceGuard guard(place_.device);
+    HIP_CHECK(hipEventDestroy(nccl_end_event_));
+    end_event_created_ = false;
+  }
+}
+#endif
 
-bool NCCLCommTask::CudaEventQuery(cudaEvent_t event) {
+bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) {
 #ifdef PADDLE_WITH_CUDA
   cudaError_t ret = cudaEventQuery(event);
   if (ret == cudaSuccess) {

diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h
@@ -150,7 +150,7 @@ namespace phi {
 
 ///////// BOOL and Floating and Integral Dispatch Marco ///////////
 
-#if NCCL_VERSION_CODE >= 21000
+#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL)
 #define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...)    \
   [&] {                                                                       \
     const auto& __dtype__ = TYPE;                                             \

diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h
@@ -32,6 +32,17 @@
 
 namespace phi {
 
+#ifdef PADDLE_WITH_HIP
+static void RecordEventTimerCallback(hipStream_t stream,
+                                     hipError_t status,
+                                     void *user_data) {
+  struct timeval time_now {};
+  gettimeofday(&time_now, nullptr);
+  double *cpu_time = static_cast<double *>(user_data);
+  *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
+  VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
+}
+#else
 static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
                                                cudaError_t status,
                                                void *user_data) {
@@ -41,6 +52,7 @@ static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
   *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
   VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
 }
+#endif
 
 class GpuTimer {
  public:

diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -349,7 +349,19 @@ void DropoutFwGPUKernelDriver(
     } else {
       bool copy_in_kernel = GetSeedDataAndIncrement(
           dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);
-
+#ifdef PADDLE_WITH_HIP
+      VectorizedRandomGenerator<T>
+          <<<grid_size, block_size, 0, stream>>>(0,
+                                                 size,
+                                                 seed_data,
+                                                 dropout_prob,
+                                                 x_data,
+                                                 mask_data,
+                                                 y_data,
+                                                 upscale_in_train,
+                                                 increment,
+                                                 main_offset);
+#else
       void* functionPtr =
           reinterpret_cast<void*>(&(VectorizedRandomGenerator<T>));
       cudaFunction_t cudaFunc;
@@ -401,6 +413,7 @@ void DropoutFwGPUKernelDriver(
 
       VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
                << ", increment = " << increment;
+#endif
     }
   } else {
     if (upscale_in_train) {

diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -373,6 +373,7 @@ template class FCFunctor<GPUContext, float16>;
 template class FCFunctor<GPUContext, float>;
 template class FCFunctor<GPUContext, double>;
 
+#ifndef PADDLE_WITH_HIP
 template <typename DeviceContext, typename T>
 void FCInt8Functor<DeviceContext, T>::operator()(
     const DeviceContext& context,
@@ -454,5 +455,6 @@ void FCInt8Functor<DeviceContext, T>::operator()(
 template class FCInt8Functor<GPUContext, float16>;
 template class FCInt8Functor<GPUContext, float>;
 template class FCInt8Functor<GPUContext, double>;
+#endif
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
@@ -11,17 +11,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
+#ifndef PADDLE_WITH_HIP
 #include <cuda_fp16.h>
-
 #include <cub/cub.cuh>
+#endif
 
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
+#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#endif
 
 namespace phi {
 namespace fusion {
@@ -49,6 +51,7 @@ void FusedBiasDropoutResidualLnGradKernel(
     DenseTensor* bias_grad,
     DenseTensor* ln_scale_grad,
     DenseTensor* ln_bias_grad) {
+#ifndef PADDLE_WITH_HIP
   using U = LayerNormParamType<T>;
   auto* d_y_data = y_grad.data<T>();
   auto* ln_scale_data =
@@ -111,6 +114,10 @@ void FusedBiasDropoutResidualLnGradKernel(
       d_x_data,
       d_bias_data,
       d_residual_data);
+#else
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "FusedBiasDropoutResidualLnGradKernel not surpport for rocm"));
+#endif
 }
 
 }  // namespace fusion

diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
@@ -17,7 +17,9 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
+#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#endif
 
 namespace phi {
 namespace fusion {
@@ -40,6 +42,7 @@ void FusedBiasDropoutResidualLnKernel(
     DenseTensor* dropout_mask_out,
     DenseTensor* ln_mean,
     DenseTensor* ln_variance) {
+#ifndef PADDLE_WITH_HIP
   using U = phi::funcs::LayerNormParamType<T>;
   auto* x_data = x.data<T>();
   auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>();
@@ -92,6 +95,10 @@ void FusedBiasDropoutResidualLnKernel(
       y_data,
       ln_mean_data,
       ln_var_data);
+#else
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "FusedBiasDropoutResidualLnKernel not surpport for rocm"));
+#endif
 }
 }  // namespace fusion
 }  // namespace phi

diff --git a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef PADDLE_WITH_CUDA
 #include <xxhash.h>
 
 #include <algorithm>
@@ -649,3 +650,4 @@ PD_REGISTER_KERNEL(fused_conv2d_add_act,  // cuda_only
                    float,
                    double,
                    phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
@@ -202,6 +202,18 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
                        ? NoMaskBwFunctor<T, float>(1.0f - dropout_rate)
                        : NoMaskBwFunctor<T, float>(1.0f - dropout_rate, 1.0f);
 
+#ifdef PADDLE_WITH_HIP
+    VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>>
+        <<<grid_size, block_size, 0, stream>>>(0,
+                                               numel,
+                                               seed_data,  //  idx: 2 need save
+                                               x_grad_data,
+                                               y_grad_data,
+                                               out_grad_data,
+                                               increment,  //  idx: 6 need save
+                                               main_offset,
+                                               functor);
+#else
     // we assume seed/offset is same across iterations
     // seed_offset_data should preserved by cudaGraph pool
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
@@ -239,6 +251,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
+#endif
   }
 }
 

diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -186,6 +186,18 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     auto dst_functor =
         NoMaskFwFunctor<T, float>(1.0f - dropout_rate, upscale_in_train);
 
+#ifdef PADDLE_WITH_HIP
+    VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>
+        <<<grid_size, block_size, 0, stream>>>(0,
+                                               numel,
+                                               seed_data,  // need save
+                                               x_data,
+                                               y_data,
+                                               out_data,
+                                               increment,  // need save
+                                               main_offset,
+                                               dst_functor);
+#else
     void* functionPtr = reinterpret_cast<void*>(
         &(VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>));
     cudaFunction_t cudaFunc;
@@ -241,6 +253,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
+#endif
   } else {
     using MT = typename phi::dtype::MPTypeTrait<T>::Type;
     MT factor = static_cast<MT>(1.0f - dropout_rate);

diff --git a/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
@@ -85,9 +85,6 @@ PD_REGISTER_KERNEL(skip_layernorm,
                    float,
                    phi::dtype::float16) {}
 #else
-PD_REGISTER_KERNEL(skip_layernorm,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::fusion::SkipLayerNormKernel,
-                   float){} {}
+PD_REGISTER_KERNEL(
+    skip_layernorm, GPU, ALL_LAYOUT, phi::fusion::SkipLayerNormKernel, float) {}
 #endif
diff --git a/paddle/phi/kernels/gpu/strided_copy_kernel.cu b/paddle/phi/kernels/gpu/strided_copy_kernel.cu
@@ -32,12 +32,21 @@ __global__ void StridedCopyCaseZeroFunc(
     phi::Array<int64_t, phi::DDim::kMaxRank + 1> output_stride) {
   int64_t input_offset = 0;
   int64_t output_offset = 0;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                          blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -458,12 +467,21 @@ __global__ void Strided2ContiguousCaseZeroFunc(
                               blockDim.z * blockDim.y * blockDim.x +
                           threadIdx.z * blockDim.y * blockDim.x +
                           threadIdx.y * blockDim.x + threadIdx.x;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                          blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -863,12 +881,21 @@ __global__ void Contiguous2StridedCaseZeroFunc(
                          threadIdx.z * blockDim.y * blockDim.x +
                          threadIdx.y * blockDim.x + threadIdx.x;
   int64_t output_offset = 0;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                          blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {