From e6038653575ab0daf637e8fbfb615c86a8bc3e90 Mon Sep 17 00:00:00 2001
From: yuguo <948529990@qq.com>
Date: Fri, 8 Dec 2023 14:29:11 +0800
Subject: [PATCH] fix rocm compile bugs (#59775)

* fix rocm compile bugs

* pre-commit fix

* fix rocm compile bugs
---
 .../new_executor/program_interpreter.cc       |   8 +-
 .../new_executor/program_interpreter.h        |   2 +-
 paddle/phi/core/distributed/nccl_comm_task.cc |  17 ++-
 paddle/phi/core/visit_type.h                  |   2 +-
 paddle/phi/kernels/autotune/gpu_timer.h       |  12 ++
 paddle/phi/kernels/funcs/dropout_impl.cu.h    |  15 ++-
 paddle/phi/kernels/funcs/fc_functor.cu        |   2 +
 ...dropout_residual_layer_norm_grad_kernel.cu |  11 +-
 ...bias_dropout_residual_layer_norm_kernel.cu |   7 ++
 .../fusion/gpu/fused_conv2d_add_act.cu        |   2 +
 .../gpu/fused_dropout_add_grad_kernel.cu      |  13 +++
 .../fusion/gpu/fused_dropout_add_kernel.cu    |  13 +++
 .../fusion/gpu/skip_layernorm_kernel.cu       |   7 +-
 paddle/phi/kernels/gpu/strided_copy_kernel.cu |  27 +++++
 .../phi/kernels/gpu/top_p_sampling_kernel.cu  | 109 ++++++++++++++++--
 .../kernels/impl/quant_linear_kernel_impl.h   |   5 +
 test/cpp/auto_parallel/CMakeLists.txt         |   7 +-
 .../pir/control_flow_dialect/CMakeLists.txt   |  42 +++----
 18 files changed, 252 insertions(+), 49 deletions(-)

diff --git a/paddle/fluid/framework/new_executor/program_interpreter.cc b/paddle/fluid/framework/new_executor/program_interpreter.cc
index 8e30a631a6da8..d1ce9f55e4690 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.cc
+++ b/paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,
 
   PrepareForCUDAGraphCapture();
 
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   calculate_stream_timer_ = std::make_unique<phi::CalculateStreamTimer>(place);
 #endif
 }
@@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() {
 
 std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
   double start_time = 0, end_time = 0;
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   start_time = calculate_stream_timer_->StartTime();
   end_time = calculate_stream_timer_->EndTime();
 #endif
@@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
 
   try {
     instr_node.WaitEvent(place_);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (enable_job_schedule_profiler_) {
       if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" &&
           !interpreter::IsCommunicationOp(instr_node)) {
@@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
     }
 
     instr_node.RecordEvent(place_);
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (enable_job_schedule_profiler_) {
       if (instr_node.Id() == last_calculate_instr_id_ &&
           calculate_stream_timer_->IsStarted()) {
diff --git a/paddle/fluid/framework/new_executor/program_interpreter.h b/paddle/fluid/framework/new_executor/program_interpreter.h
index 2bc99114cc598..b19e3a06a4258 100644
--- a/paddle/fluid/framework/new_executor/program_interpreter.h
+++ b/paddle/fluid/framework/new_executor/program_interpreter.h
@@ -16,7 +16,7 @@
 
 #include "paddle/fluid/framework/new_executor/interpreter_base_impl.h"
 
-#if defined(PADDLE_WITH_CUDA)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/phi/kernels/autotune/gpu_timer.h"
 #endif
 
diff --git a/paddle/phi/core/distributed/nccl_comm_task.cc b/paddle/phi/core/distributed/nccl_comm_task.cc
index 6bc002627a023..4e2efea0068eb 100644
--- a/paddle/phi/core/distributed/nccl_comm_task.cc
+++ b/paddle/phi/core/distributed/nccl_comm_task.cc
@@ -90,6 +90,7 @@ void NCCLCommTask::EndRecord() {
 #endif
 }
 
+#ifdef PADDLE_WITH_CUDA
 void NCCLCommTask::ClearRecord() {
   if (start_event_created_) {
     backends::gpu::GPUDeviceGuard guard(place_.device);
@@ -102,8 +103,22 @@ void NCCLCommTask::ClearRecord() {
     end_event_created_ = false;
   }
 }
+#else  // PADDLE_WITH_HIP
+void NCCLCommTask::ClearRecord() {
+  if (start_event_created_) {
+    backends::gpu::GPUDeviceGuard guard(place_.device);
+    HIP_CHECK(hipEventDestroy(nccl_start_event_));
+    start_event_created_ = false;
+  }
+  if (end_event_created_) {
+    backends::gpu::GPUDeviceGuard guard(place_.device);
+    HIP_CHECK(hipEventDestroy(nccl_end_event_));
+    end_event_created_ = false;
+  }
+}
+#endif
 
-bool NCCLCommTask::CudaEventQuery(cudaEvent_t event) {
+bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) {
 #ifdef PADDLE_WITH_CUDA
   cudaError_t ret = cudaEventQuery(event);
   if (ret == cudaSuccess) {
diff --git a/paddle/phi/core/visit_type.h b/paddle/phi/core/visit_type.h
index 28f575295b47e..069e737d95607 100644
--- a/paddle/phi/core/visit_type.h
+++ b/paddle/phi/core/visit_type.h
@@ -150,7 +150,7 @@ namespace phi {
 
 ///////// BOOL and Floating and Integral Dispatch Marco ///////////
 
-#if NCCL_VERSION_CODE >= 21000
+#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL)
 #define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \
   [&] {                                                                    \
     const auto& __dtype__ = TYPE;                                          \
diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h
index 1882c21b9cd72..b04c46351c2cf 100644
--- a/paddle/phi/kernels/autotune/gpu_timer.h
+++ b/paddle/phi/kernels/autotune/gpu_timer.h
@@ -32,6 +32,17 @@
 
 namespace phi {
 
+#ifdef PADDLE_WITH_HIP
+static void RecordEventTimerCallback(hipStream_t stream,
+                                     hipError_t status,
+                                     void *user_data) {
+  struct timeval time_now {};
+  gettimeofday(&time_now, nullptr);
+  double *cpu_time = static_cast<double *>(user_data);
+  *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
+  VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
+}
+#else
 static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
                                                cudaError_t status,
                                                void *user_data) {
@@ -41,6 +52,7 @@ static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
   *cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
   VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
 }
+#endif
 
 class GpuTimer {
  public:
diff --git a/paddle/phi/kernels/funcs/dropout_impl.cu.h b/paddle/phi/kernels/funcs/dropout_impl.cu.h
index 1fc2f038aae64..0a89a3ca825ba 100644
--- a/paddle/phi/kernels/funcs/dropout_impl.cu.h
+++ b/paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -349,7 +349,19 @@ void DropoutFwGPUKernelDriver(
     } else {
       bool copy_in_kernel = GetSeedDataAndIncrement(
           dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);
-
+#ifdef PADDLE_WITH_HIP
+      VectorizedRandomGenerator
+          <<>>(0,
+               size,
+               seed_data,
+               dropout_prob,
+               x_data,
+               mask_data,
+               y_data,
+               upscale_in_train,
+               increment,
+               main_offset);
+#else
       void* functionPtr =
           reinterpret_cast(&(VectorizedRandomGenerator));
       cudaFunction_t cudaFunc;
@@ -401,6 +413,7 @@ void DropoutFwGPUKernelDriver(
 
       VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
                << ", increment = " << increment;
+#endif
     }
   } else {
     if (upscale_in_train) {
diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu
index 84a8cc309516b..107759313069d 100644
--- a/paddle/phi/kernels/funcs/fc_functor.cu
+++ b/paddle/phi/kernels/funcs/fc_functor.cu
@@ -373,6 +373,7 @@ template class FCFunctor;
 template class FCFunctor;
 template class FCFunctor;
 
+#ifndef PADDLE_WITH_HIP
 template
 void FCInt8Functor::operator()(
     const DeviceContext& context,
@@ -454,5 +455,6 @@ void FCInt8Functor::operator()(
 template class FCInt8Functor;
 template class FCInt8Functor;
 template class FCInt8Functor;
+#endif
 }  // namespace funcs
 }  // namespace phi
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
index de277f3d21b2c..0f93e21553a74 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_grad_kernel.cu
@@ -11,17 +11,19 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-
+#ifndef PADDLE_WITH_HIP
 #include
-
 #include
+#endif
 
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
+#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#endif
 
 namespace phi {
 namespace fusion {
@@ -49,6 +51,7 @@ void FusedBiasDropoutResidualLnGradKernel(
     DenseTensor* bias_grad,
     DenseTensor* ln_scale_grad,
     DenseTensor* ln_bias_grad) {
+#ifndef PADDLE_WITH_HIP
   using U = LayerNormParamType<T>;
   auto* d_y_data = y_grad.data<T>();
   auto* ln_scale_data =
@@ -111,6 +114,10 @@ void FusedBiasDropoutResidualLnGradKernel(
       d_x_data,
       d_bias_data,
       d_residual_data);
+#else
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "FusedBiasDropoutResidualLnGradKernel is not supported on ROCm"));
+#endif
 }
 
 }  // namespace fusion
diff --git a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
index 78c87a6794096..fd1f754cc9827 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_bias_dropout_residual_layer_norm_kernel.cu
@@ -17,7 +17,9 @@
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_utils.h"
 #include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
+#ifndef PADDLE_WITH_HIP
 #include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
+#endif
 
 namespace phi {
 namespace fusion {
@@ -40,6 +42,7 @@ void FusedBiasDropoutResidualLnKernel(
     DenseTensor* dropout_mask_out,
     DenseTensor* ln_mean,
     DenseTensor* ln_variance) {
+#ifndef PADDLE_WITH_HIP
   using U = phi::funcs::LayerNormParamType<T>;
   auto* x_data = x.data<T>();
   auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>();
@@ -92,6 +95,10 @@ void FusedBiasDropoutResidualLnKernel(
       y_data,
       ln_mean_data,
      ln_var_data);
+#else
+  PADDLE_THROW(phi::errors::Unimplemented(
+      "FusedBiasDropoutResidualLnKernel is not supported on ROCm"));
+#endif
 }
 }  // namespace fusion
 }  // namespace phi
diff --git a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
index 6192b64768088..ac716fa174186 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#ifdef PADDLE_WITH_CUDA
 #include
 #include
 
@@ -649,3 +650,4 @@ PD_REGISTER_KERNEL(fused_conv2d_add_act,  // cuda_only
                    float,
                    double,
                    phi::dtype::float16) {}
+#endif
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
index 03ae48152a9e4..a3eb1483b4f74 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
@@ -202,6 +202,18 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
                        ? NoMaskBwFunctor(1.0f - dropout_rate)
                        : NoMaskBwFunctor(1.0f - dropout_rate, 1.0f);
 
+#ifdef PADDLE_WITH_HIP
+    VectorizedDropoutBackward>
+        <<>>(0,
+             numel,
+             seed_data,  // idx: 2 need save
+             x_grad_data,
+             y_grad_data,
+             out_grad_data,
+             increment,  // idx: 6 need save
+             main_offset,
+             functor);
+#else
     // we assume seed/offset is same across iterations
     // seed_offset_data should preserved by cudaGraph pool
     const phi::GPUContext* dev_ctx_p = &dev_ctx;
@@ -239,6 +251,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
+#endif
   }
 }
 
diff --git a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
index 5f21b7b05c3d9..17d9698df4d13 100644
--- a/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -186,6 +186,18 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
     auto dst_functor =
         NoMaskFwFunctor(1.0f - dropout_rate, upscale_in_train);
 
+#ifdef PADDLE_WITH_HIP
+    VectorizedDropoutForward>
+        <<>>(0,
+             numel,
+             seed_data,  // need save
+             x_data,
+             y_data,
+             out_data,
+             increment,  // need save
+             main_offset,
+             dst_functor);
+#else
     void* functionPtr = reinterpret_cast(
         &(VectorizedDropoutForward>));
     cudaFunction_t cudaFunc;
@@ -241,6 +253,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
 
     VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
              << ", increment = " << increment;
+#endif
   } else {
     using MT = typename phi::dtype::MPTypeTrait<T>::Type;
     MT factor = static_cast<MT>(1.0f - dropout_rate);
diff --git a/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu b/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
index 1bb5c5dfb6301..4671534937a66 100644
--- a/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
+++ b/paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
@@ -85,9 +85,6 @@ PD_REGISTER_KERNEL(skip_layernorm,
                    float,
                    phi::dtype::float16) {}
 #else
-PD_REGISTER_KERNEL(skip_layernorm,
-                   GPU,
-                   ALL_LAYOUT,
-                   phi::fusion::SkipLayerNormKernel,
-                   float){} {}
+PD_REGISTER_KERNEL(
+    skip_layernorm, GPU, ALL_LAYOUT, phi::fusion::SkipLayerNormKernel, float) {}
 #endif
diff --git a/paddle/phi/kernels/gpu/strided_copy_kernel.cu b/paddle/phi/kernels/gpu/strided_copy_kernel.cu
index ae173b5f03528..64cd37cd14b63 100644
--- a/paddle/phi/kernels/gpu/strided_copy_kernel.cu
+++ b/paddle/phi/kernels/gpu/strided_copy_kernel.cu
@@ -32,12 +32,21 @@ __global__ void StridedCopyCaseZeroFunc(
     phi::Array output_stride) {
   int64_t input_offset = 0;
   int64_t output_offset = 0;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                          blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -458,12 +467,21 @@ __global__ void Strided2ContiguousCaseZeroFunc(
                          blockDim.z * blockDim.y * blockDim.x +
                      threadIdx.z * blockDim.y * blockDim.x +
                      threadIdx.y * blockDim.x + threadIdx.x;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                         blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -863,12 +881,21 @@ __global__ void Contiguous2StridedCaseZeroFunc(
                      threadIdx.z * blockDim.y * blockDim.x +
                      threadIdx.y * blockDim.x + threadIdx.x;
   int64_t output_offset = 0;
+#ifdef PADDLE_WITH_HIP
+  int64_t coordinate[6] = {threadIdx.x,
+                           threadIdx.y,
+                           threadIdx.z,
+                           blockIdx.x,
+                           blockIdx.y,
+                           blockIdx.z};
+#else
   float coordinate[6] = {threadIdx.x,
                          threadIdx.y,
                          threadIdx.z,
                          blockIdx.x,
                          blockIdx.y,
                          blockIdx.z};
+#endif
 
 #pragma unroll
   for (int dim = RANK - 1; dim >= 0; --dim) {
diff --git a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
index a78040eb6a669..91aa8e63246b3 100644
--- a/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
+++ b/paddle/phi/kernels/gpu/top_p_sampling_kernel.cu
@@ -14,10 +14,18 @@
 
 #include "paddle/phi/kernels/top_p_sampling_kernel.h"
 
+#ifdef PADDLE_WITH_HIP
+#include
+#include
+#include
+#include
+namespace cub = hipcub;
+#else
 #include
 #include
+#include
+#endif
 
-#include "cub/cub.cuh"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/core/kernel_registry.h"
@@ -49,6 +57,16 @@ struct DataTypeTraits {
     __VA_ARGS__;                             \
   } break
 
+#ifdef PADDLE_WITH_HIP
+#define WARP_SIZE 64
+#define FIXED_BLOCK_DIM(...)                 \
+  FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \
+  FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__);  \
+  FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);
+#else
+#define WARP_SIZE 32
 #define FIXED_BLOCK_DIM(...)                 \
   FIXED_BLOCK_DIM_BASE(1024, ##__VA_ARGS__); \
   FIXED_BLOCK_DIM_BASE(512, ##__VA_ARGS__);  \
@@ -56,6 +74,7 @@ struct DataTypeTraits {
   FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__);  \
   FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__);   \
   FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__)
+#endif
 
 struct SegmentOffsetIter {
   explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {}
@@ -107,6 +126,16 @@ struct Pair {
 
 inline int div_up(int a, int n) { return (a + n - 1) / n; }
 
+#ifdef PADDLE_WITH_HIP
+__global__ void setup_kernel(hiprandState_t* state,
+                             const uint64_t seed,
+                             const int bs) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = idx; i < bs; i += gridDim.x * blockDim.x) {
+    hiprand_init(seed, i, 0, &state[i]);
+  }
+}
+#else
 __global__ void setup_kernel(curandState_t* state,
                              const uint64_t seed,
                              const int bs) {
@@ -115,6 +144,7 @@ __global__ void setup_kernel(curandState_t* state,
     curand_init(seed, i, 0, &state[i]);
   }
 }
+#endif
 
 template
 __device__ __forceinline__ void AddTo(Pair topk[],
@@ -199,7 +229,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair topk[],
 template
 __forceinline__ __device__ Pair WarpReduce(Pair input) {
 #pragma unroll
-  for (int offset = 16; offset > 0; offset >>= 1) {
+  for (int offset = WARP_SIZE / 2; offset > 0; offset >>= 1) {
     T tmp_val =
         phi::backends::gpu::CudaShuffleDownSync(FINAL_MASK, input.v, offset);
     int tmp_id =
@@ -231,7 +261,7 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[],
     shared_max[wid] = input_now;
   }
   __syncthreads();
-  input_now = (tid < BlockSize / 32)
+  input_now = (tid < BlockSize / WARP_SIZE)
                   ? shared_max[lane]
                   : Pair(std::numeric_limits::min(), -1);
   if (wid == 0) {
@@ -259,12 +289,22 @@ __device__ __forceinline__ void BlockReduce(Pair shared_max[],
       if (MaxLength < 5) {
         if (*beam >= MaxLength) break;
       } else {
+#ifdef PADDLE_WITH_HIP
+        uint64_t mask = 0;
+        mask = __ballot(true);
+        if (tid_max / WARP_SIZE == wid) {
+          if (__shfl_down(*beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength)
+            break;
+        }
+#else
         unsigned mask = 0u;
         mask = __ballot_sync(FINAL_MASK, true);
-        if (tid_max / 32 == wid) {
-          if (__shfl_down_sync(FINAL_MASK, *beam, tid_max % 32, 32) == MaxLength)
+        if (tid_max / WARP_SIZE == wid) {
+          if (__shfl_down_sync(
+                  FINAL_MASK, *beam, tid_max % WARP_SIZE, WARP_SIZE) == MaxLength)
             break;
         }
+#endif
       }
     }
   }
@@ -276,12 +316,16 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
                                      int64_t* out_id,  // topk id
                                      T* out_val,       // topk val
                                      int vocab_size,
+#ifdef PADDLE_WITH_HIP
+                                     hiprandState_t* state,
+#else
                                      curandState_t* state,
+#endif
                                      int* count_iter,
                                      int* count_iter_begin) {
   const int tid = threadIdx.x;
-  const int wid = tid / 32;
-  const int lane = tid % 32;
+  const int wid = tid / WARP_SIZE;
+  const int lane = tid % WARP_SIZE;
   const int bid = blockIdx.x;
   const float threshold_now =
       threshold ? static_cast<float>(threshold[bid]) : 0.f;
@@ -289,7 +333,7 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
   int top_num = TopPBeamTopK;
   float top_p_num = static_cast<float>(top_ps[bid]);
 
-  __shared__ Pair shared_max[BlockSize / 32];
+  __shared__ Pair shared_max[BlockSize / WARP_SIZE];
   __shared__ Pair beam_max[TopPBeamTopK];
 
   Pair topk[MaxLength];
@@ -322,7 +366,11 @@ __global__ void KeMatrixTopPBeamTopK(const T* src,
   }
   if (tid == 0) {
     count_iter_begin[bid] = count_iter[bid];
+#ifdef PADDLE_WITH_HIP
+    float rand_top_p = hiprand_uniform(state + bid) * top_p_num;
+#else
     float rand_top_p = curand_uniform(state + bid) * top_p_num;
+#endif
    top_ps[bid] = (T)rand_top_p;
 
     float sum_prob = 0.0f;
@@ -421,9 +469,9 @@ __global__ void topp_sampling(T* sorted_probs,
   __shared__ float rand_p;
   const int tid = threadIdx.x;
   const int bid = blockIdx.x;
-  constexpr int NUM_WARPS = BLOCK_SIZE / 32;
-  const int lane_id = tid % 32;
-  const int warp_id = tid / 32;
+  constexpr int NUM_WARPS = BLOCK_SIZE / WARP_SIZE;
+  const int lane_id = tid % WARP_SIZE;
+  const int warp_id = tid / WARP_SIZE;
   const float p_t = static_cast<float>(top_ps[bid]);
   const float threshold_now =
       threshold ? static_cast<float>(threshold[bid]) : 0.f;
@@ -444,7 +492,11 @@ __global__ void topp_sampling(T* sorted_probs,
   typedef cub::BlockReduce BlockReduce;
   __shared__ typename BlockScan::TempStorage temp_storage;
   __shared__ typename BlockReduce::TempStorage temp_storage_reduce;
+#ifdef PADDLE_WITH_HIP
+  __shared__ uint64_t selected_shared[NUM_WARPS];
+#else
   __shared__ uint32_t selected_shared[NUM_WARPS];
+#endif
   int threshold_id = 0;
 
   // Initialize running total
@@ -468,7 +520,11 @@ __global__ void topp_sampling(T* sorted_probs,
     BlockScan(temp_storage)
         .InclusiveSum(thread_count, thread_offset, prefix_op);
 
+#ifdef PADDLE_WITH_HIP
+    uint64_t activate_mask = __ballot(rand_p <= thread_offset);
+#else
     uint32_t activate_mask = __ballot_sync(FINAL_MASK, rand_p <= thread_offset);
+#endif
 
     i_activate = i;
     if (activate_mask != 0) {
@@ -506,9 +562,15 @@ __global__ void topp_sampling(T* sorted_probs,
     // don't sample low score token
     int max_id =
         BlockReduce(temp_storage_reduce).Reduce(threshold_id, MaxOp());
+#ifdef PADDLE_WITH_HIP
+    hiprandStatePhilox4_32_10_t rng;
+    hiprand_init(seed, tid, 0, &rng);
+    int random_id = hiprand(&rng) % (max_id + 1);
+#else
     curandStatePhilox4_32_10_t rng;
     curand_init(seed, tid, 0, &rng);
     int random_id = curand(&rng) % (max_id + 1);
+#endif
     out_id[bid] = sorted_id[offset + random_id];
     out_val[bid] = sorted_probs[offset + random_id];
   } else {
@@ -537,6 +599,19 @@ __global__ void set_sorted_num(int* need_sorted_num, int bs) {
   *need_sorted_num = bs;
 }
 
+#ifdef PADDLE_WITH_HIP
+template
+__global__ void print_kernel(T* input, int size) {
+  for (int i = 0; i < size; i++) {
+    printf("[");
+    if (i != size - 1) {
+      printf("%f, ", static_cast<float>(input[i]));
+    } else {
+      printf("%f]\n", static_cast<float>(input[i]));
+    }
+  }
+}
+#else
 template
 __global__ void print_kernel(T* input, int size) {
   for (int i = 0; i < size; i++) {
@@ -550,6 +625,7 @@ __global__ void print_kernel(T* input, int size) {
     VLOG(0) << ss.str();
   }
 }
+#endif
 
 template
 T* SafeGetTensorPtr(const DenseTensor& t) {
@@ -612,6 +688,16 @@ void TopPSamplingKernel(const Context& dev_ctx,
     PD_THROW("the input data shape has error in the FillIndex kernel.");
   }
 
+#ifdef PADDLE_WITH_HIP
+  hiprandState_t* dev_curand_states;
+  phi::Allocator::AllocationPtr curand_states_buf{nullptr};
+  curand_states_buf = phi::memory_utils::Alloc(
+      dev_ctx.GetPlace(),
+      bs * sizeof(hiprandState_t),
+      phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
+  dev_curand_states =
+      reinterpret_cast<hiprandState_t*>(curand_states_buf->ptr());
+#else
   curandState_t* dev_curand_states;
   phi::Allocator::AllocationPtr curand_states_buf{nullptr};
   curand_states_buf = phi::memory_utils::Alloc(
@@ -620,6 +706,7 @@ void TopPSamplingKernel(const Context& dev_ctx,
       phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx.stream())));
   dev_curand_states =
      reinterpret_cast<curandState_t*>(curand_states_buf->ptr());
+#endif
   uint64_t seed;
   if (random_seed == -1) {
     seed = static_cast<uint64_t>(time(NULL) % 1000000);
diff --git a/paddle/phi/kernels/impl/quant_linear_kernel_impl.h b/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
index f48e871dce165..1948c928733e6 100644
--- a/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
+++ b/paddle/phi/kernels/impl/quant_linear_kernel_impl.h
@@ -76,6 +76,10 @@ void QuantLinearKernel(const Context& dev_ctx,
             "The weight's datatype is expected to be int8 when use quant. But "
             "received weight's datatype is %d",
             static_cast<int>(w.dtype())));
+#ifdef PADDLE_WITH_HIP
+    PADDLE_THROW(
+        phi::errors::Unimplemented("FCInt8Functor is not supported on ROCm"));
+#else
     phi::funcs::FCInt8Functor fc;
     fc(dev_ctx,
        M,
@@ -92,6 +96,7 @@ void QuantLinearKernel(const Context& dev_ctx,
        bias_data,
        with_relu,
        padding_weights);
+#endif
     return;
   }
 
diff --git a/test/cpp/auto_parallel/CMakeLists.txt b/test/cpp/auto_parallel/CMakeLists.txt
index 7c2a329172107..c736ad85007b6 100644
--- a/test/cpp/auto_parallel/CMakeLists.txt
+++ b/test/cpp/auto_parallel/CMakeLists.txt
@@ -35,6 +35,7 @@ if(WITH_DISTRIBUTE)
 endif()
-
-cc_test(dist_mapper_test SRCS dist_mapper_test.cc DEPS phi)
-
+cc_test(
+  dist_mapper_test
+  SRCS dist_mapper_test.cc
+  DEPS phi)
diff --git a/test/cpp/pir/control_flow_dialect/CMakeLists.txt b/test/cpp/pir/control_flow_dialect/CMakeLists.txt
index 9309a8a774b29..fe751953c8fc4 100644
--- a/test/cpp/pir/control_flow_dialect/CMakeLists.txt
+++ b/test/cpp/pir/control_flow_dialect/CMakeLists.txt
@@ -1,21 +1,23 @@
-cc_test_old(
-  if_op_test
-  SRCS
-  if_op_test.cc
-  DEPS
-  pir
-  op_dialect_vjp
-  pir_transforms
-  op_dialect
-  gtest)
+if(NOT WITH_ROCM)
+  cc_test_old(
+    if_op_test
+    SRCS
+    if_op_test.cc
+    DEPS
+    pir
+    op_dialect_vjp
+    pir_transforms
+    op_dialect
+    gtest)
 
-cc_test_old(
-  while_op_test
-  SRCS
-  while_op_test.cc
-  DEPS
-  pir
-  op_dialect_vjp
-  pir_transforms
-  op_dialect
-  gtest)
+  cc_test_old(
+    while_op_test
+    SRCS
+    while_op_test.cc
+    DEPS
+    pir
+    op_dialect_vjp
+    pir_transforms
+    op_dialect
+    gtest)
+endif()