fix rocm compile bugs (#59775)
* fix rocm compile bugs

* pre-commit fix

* fix rocm compile bugs
yuguo-Jack authored Dec 8, 2023
1 parent 6e09e44 commit e603865
Showing 18 changed files with 252 additions and 49 deletions.
8 changes: 4 additions & 4 deletions paddle/fluid/framework/new_executor/program_interpreter.cc
@@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,

PrepareForCUDAGraphCapture();

#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
calculate_stream_timer_ = std::make_unique<phi::CalculateStreamTimer>(place);
#endif
}
@@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() {

std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
double start_time = 0, end_time = 0;
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
start_time = calculate_stream_timer_->StartTime();
end_time = calculate_stream_timer_->EndTime();
#endif
@@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {

try {
instr_node.WaitEvent(place_);
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" &&
!interpreter::IsCommunicationOp(instr_node)) {
@@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
}

instr_node.RecordEvent(place_);
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
if (instr_node.Id() == last_calculate_instr_id_ &&
calculate_stream_timer_->IsStarted()) {
2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/program_interpreter.h
@@ -16,7 +16,7 @@

#include "paddle/fluid/framework/new_executor/interpreter_base_impl.h"

#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#endif

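Note: the program_interpreter changes above only widen the preprocessor guard from CUDA-only to CUDA-or-HIP, so the calculate-stream timer members and the profiling branches also compile on ROCm builds. Below is a minimal sketch of the backend aliasing such a shared guard typically relies on, mirroring the gpuEvent_t name that appears later in this diff (illustrative names, not Paddle's actual header):

#if defined(PADDLE_WITH_CUDA)
#include <cuda_runtime.h>
using gpuEvent_t = cudaEvent_t;   // one backend-neutral name for both toolchains
using gpuStream_t = cudaStream_t;
#elif defined(PADDLE_WITH_HIP)
#include <hip/hip_runtime.h>
using gpuEvent_t = hipEvent_t;
using gpuStream_t = hipStream_t;
#endif
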
17 changes: 16 additions & 1 deletion paddle/phi/core/distributed/nccl_comm_task.cc
@@ -90,6 +90,7 @@ void NCCLCommTask::EndRecord() {
#endif
}

#ifdef PADDLE_WITH_CUDA
void NCCLCommTask::ClearRecord() {
if (start_event_created_) {
backends::gpu::GPUDeviceGuard guard(place_.device);
@@ -102,8 +103,22 @@ void NCCLCommTask::ClearRecord() {
end_event_created_ = false;
}
}
#else // PADDLE_WITH_HIP
void NCCLCommTask::ClearRecord() {
if (start_event_created_) {
backends::gpu::GPUDeviceGuard guard(place_.device);
HIP_CHECK(hipEventDestroy(nccl_start_event_));
start_event_created_ = false;
}
if (end_event_created_) {
backends::gpu::GPUDeviceGuard guard(place_.device);
HIP_CHECK(hipEventDestroy(nccl_end_event_));
end_event_created_ = false;
}
}
#endif

bool NCCLCommTask::CudaEventQuery(cudaEvent_t event) {
bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) {
#ifdef PADDLE_WITH_CUDA
cudaError_t ret = cudaEventQuery(event);
if (ret == cudaSuccess) {
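Note: besides adding a HIP variant of ClearRecord (hipEventDestroy wrapped in HIP_CHECK), the diff changes CudaEventQuery to take the backend-neutral gpuEvent_t instead of cudaEvent_t. A self-contained sketch of that query pattern, assuming gpuEvent_t aliases cudaEvent_t or hipEvent_t depending on the build:

#include <cstdio>
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
using gpuEvent_t = cudaEvent_t;
#else
#include <hip/hip_runtime.h>
using gpuEvent_t = hipEvent_t;
#endif

// Returns true once the event has completed, false while it is still pending.
bool EventReady(gpuEvent_t event) {
#ifdef PADDLE_WITH_CUDA
  cudaError_t ret = cudaEventQuery(event);
  if (ret == cudaSuccess) return true;
  if (ret != cudaErrorNotReady) std::fprintf(stderr, "cudaEventQuery failed\n");
#else
  hipError_t ret = hipEventQuery(event);
  if (ret == hipSuccess) return true;
  if (ret != hipErrorNotReady) std::fprintf(stderr, "hipEventQuery failed\n");
#endif
  return false;
}
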
2 changes: 1 addition & 1 deletion paddle/phi/core/visit_type.h
@@ -150,7 +150,7 @@ namespace phi {

///////// BOOL and Floating and Integral Dispatch Marco ///////////

#if NCCL_VERSION_CODE >= 21000
#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL)
#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
12 changes: 12 additions & 0 deletions paddle/phi/kernels/autotune/gpu_timer.h
@@ -32,6 +32,17 @@

namespace phi {

#ifdef PADDLE_WITH_HIP
static void RecordEventTimerCallback(hipStream_t stream,
hipError_t status,
void *user_data) {
struct timeval time_now {};
gettimeofday(&time_now, nullptr);
double *cpu_time = static_cast<double *>(user_data);
*cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
}
#else
static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
cudaError_t status,
void *user_data) {
@@ -41,6 +52,7 @@ static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
*cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
}
#endif

class GpuTimer {
public:
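Note: the HIP overload of RecordEventTimerCallback drops CUDART_CB (a CUDA-only calling-convention macro) and takes hipStream_t/hipError_t. A host callback of this shape is normally enqueued with hipStreamAddCallback, as sketched below; the registration function is an assumed usage example, not this file's actual call site:

#include <sys/time.h>
#include <hip/hip_runtime.h>

static void RecordTimeMs(hipStream_t /*stream*/, hipError_t /*status*/, void* user_data) {
  timeval now{};
  gettimeofday(&now, nullptr);
  // Store wall-clock milliseconds into the caller-provided slot.
  *static_cast<double*>(user_data) = now.tv_sec * 1000.0 + now.tv_usec / 1000.0;
}

void EnqueueTimestamp(hipStream_t stream, double* cpu_time_ms) {
  // The callback runs on the host once all prior work on the stream has finished.
  hipStreamAddCallback(stream, RecordTimeMs, cpu_time_ms, /*flags=*/0);
}
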
15 changes: 14 additions & 1 deletion paddle/phi/kernels/funcs/dropout_impl.cu.h
@@ -349,7 +349,19 @@ void DropoutFwGPUKernelDriver(
} else {
bool copy_in_kernel = GetSeedDataAndIncrement(
dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);

#ifdef PADDLE_WITH_HIP
VectorizedRandomGenerator<T>
<<<grid_size, block_size, 0, stream>>>(0,
size,
seed_data,
dropout_prob,
x_data,
mask_data,
y_data,
upscale_in_train,
increment,
main_offset);
#else
void* functionPtr =
reinterpret_cast<void*>(&(VectorizedRandomGenerator<T>));
cudaFunction_t cudaFunc;
@@ -401,6 +413,7 @@ void DropoutFwGPUKernelDriver(

VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
<< ", increment = " << increment;
#endif
}
} else {
if (upscale_in_train) {
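Note: on ROCm the vectorized random-dropout kernel is launched directly with the triple-chevron syntax, while the CUDA path keeps the cudaFunction_t indirection used by its CUDA Graph-aware launch; the same split recurs in fused_dropout_add_grad_kernel.cu and fused_dropout_add_kernel.cu below. A minimal, illustrative direct launch (placeholder kernel, not Paddle's):

#include <cstddef>
#include <hip/hip_runtime.h>

template <typename T>
__global__ void ScaleKernel(const T* x, T* y, size_t n, T factor) {
  size_t i = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (i < n) y[i] = x[i] * factor;
}

template <typename T>
void LaunchScale(const T* x, T* y, size_t n, T factor, hipStream_t stream) {
  dim3 block(256);
  dim3 grid(static_cast<unsigned int>((n + block.x - 1) / block.x));
  // hipcc accepts the <<<...>>> launch syntax, so no driver-level function lookup is needed.
  ScaleKernel<T><<<grid, block, 0, stream>>>(x, y, n, factor);
}
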
2 changes: 2 additions & 0 deletions paddle/phi/kernels/funcs/fc_functor.cu
@@ -373,6 +373,7 @@ template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;

#ifndef PADDLE_WITH_HIP
template <typename DeviceContext, typename T>
void FCInt8Functor<DeviceContext, T>::operator()(
const DeviceContext& context,
@@ -454,5 +455,6 @@ void FCInt8Functor<DeviceContext, T>::operator()(
template class FCInt8Functor<GPUContext, float16>;
template class FCInt8Functor<GPUContext, float>;
template class FCInt8Functor<GPUContext, double>;
#endif
} // namespace funcs
} // namespace phi
@@ -11,17 +11,19 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PADDLE_WITH_HIP
#include <cuda_fp16.h>

#include <cub/cub.cuh>
#endif

#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
#endif

namespace phi {
namespace fusion {
@@ -49,6 +51,7 @@ void FusedBiasDropoutResidualLnGradKernel(
DenseTensor* bias_grad,
DenseTensor* ln_scale_grad,
DenseTensor* ln_bias_grad) {
#ifndef PADDLE_WITH_HIP
using U = LayerNormParamType<T>;
auto* d_y_data = y_grad.data<T>();
auto* ln_scale_data =
@@ -111,6 +114,10 @@ void FusedBiasDropoutResidualLnGradKernel(
d_x_data,
d_bias_data,
d_residual_data);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"FusedBiasDropoutResidualLnGradKernel not surpport for rocm"));
#endif
}

} // namespace fusion
@@ -17,7 +17,9 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
#endif

namespace phi {
namespace fusion {
@@ -40,6 +42,7 @@ void FusedBiasDropoutResidualLnKernel(
DenseTensor* dropout_mask_out,
DenseTensor* ln_mean,
DenseTensor* ln_variance) {
#ifndef PADDLE_WITH_HIP
using U = phi::funcs::LayerNormParamType<T>;
auto* x_data = x.data<T>();
auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>();
@@ -92,6 +95,10 @@ void FusedBiasDropoutResidualLnKernel(
y_data,
ln_mean_data,
ln_var_data);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"FusedBiasDropoutResidualLnKernel not surpport for rocm"));
#endif
}
} // namespace fusion
} // namespace phi
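Note: the two fused bias-dropout-residual-layernorm kernels above are stubbed out rather than ported: their CUDA bodies are compiled only when PADDLE_WITH_HIP is not defined, and the ROCm build raises an Unimplemented error instead. The pattern, reduced to a sketch with a placeholder kernel name and assuming the surrounding Paddle headers:

template <typename T, typename Context>
void SomeFusedKernel(const Context& dev_ctx, DenseTensor* out) {
#ifndef PADDLE_WITH_HIP
  // CUDA-only implementation goes here.
#else
  PADDLE_THROW(phi::errors::Unimplemented(
      "SomeFusedKernel is not supported on ROCm builds."));
#endif
}
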
2 changes: 2 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_CUDA
#include <xxhash.h>

#include <algorithm>
@@ -649,3 +650,4 @@ PD_REGISTER_KERNEL(fused_conv2d_add_act, // cuda_only
float,
double,
phi::dtype::float16) {}
#endif
13 changes: 13 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
@@ -202,6 +202,18 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
? NoMaskBwFunctor<T, float>(1.0f - dropout_rate)
: NoMaskBwFunctor<T, float>(1.0f - dropout_rate, 1.0f);

#ifdef PADDLE_WITH_HIP
VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>>
<<<grid_size, block_size, 0, stream>>>(0,
numel,
seed_data, // idx: 2 need save
x_grad_data,
y_grad_data,
out_grad_data,
increment, // idx: 6 need save
main_offset,
functor);
#else
// we assume seed/offset is same across iterations
// seed_offset_data should preserved by cudaGraph pool
const phi::GPUContext* dev_ctx_p = &dev_ctx;
Expand Down Expand Up @@ -239,6 +251,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,

VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
<< ", increment = " << increment;
#endif
}
}

13 changes: 13 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
@@ -186,6 +186,18 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
auto dst_functor =
NoMaskFwFunctor<T, float>(1.0f - dropout_rate, upscale_in_train);

#ifdef PADDLE_WITH_HIP
VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>
<<<grid_size, block_size, 0, stream>>>(0,
numel,
seed_data, // need save
x_data,
y_data,
out_data,
increment, // need save
main_offset,
dst_functor);
#else
void* functionPtr = reinterpret_cast<void*>(
&(VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>));
cudaFunction_t cudaFunc;
@@ -241,6 +253,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,

VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
<< ", increment = " << increment;
#endif
} else {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
MT factor = static_cast<MT>(1.0f - dropout_rate);
7 changes: 2 additions & 5 deletions paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
@@ -85,9 +85,6 @@ PD_REGISTER_KERNEL(skip_layernorm,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(skip_layernorm,
GPU,
ALL_LAYOUT,
phi::fusion::SkipLayerNormKernel,
float){} {}
PD_REGISTER_KERNEL(
skip_layernorm, GPU, ALL_LAYOUT, phi::fusion::SkipLayerNormKernel, float) {}
#endif
27 changes: 27 additions & 0 deletions paddle/phi/kernels/gpu/strided_copy_kernel.cu
@@ -32,12 +32,21 @@ __global__ void StridedCopyCaseZeroFunc(
phi::Array<int64_t, phi::DDim::kMaxRank + 1> output_stride) {
int64_t input_offset = 0;
int64_t output_offset = 0;
#ifdef PADDLE_WITH_HIP
int64_t coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#else
float coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#endif

#pragma unroll
for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -458,12 +467,21 @@ __global__ void Strided2ContiguousCaseZeroFunc(
blockDim.z * blockDim.y * blockDim.x +
threadIdx.z * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
#ifdef PADDLE_WITH_HIP
int64_t coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#else
float coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#endif

#pragma unroll
for (int dim = RANK - 1; dim >= 0; --dim) {
@@ -863,12 +881,21 @@ __global__ void Contiguous2StridedCaseZeroFunc(
threadIdx.z * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
int64_t output_offset = 0;
#ifdef PADDLE_WITH_HIP
int64_t coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#else
float coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#endif

#pragma unroll
for (int dim = RANK - 1; dim >= 0; --dim) {
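Note: in the three strided-copy kernels the per-thread coordinate array switches from float to int64_t on ROCm. Storing threadIdx/blockIdx values as 64-bit integers keeps large indices exact (float loses integer precision above 2^24) and avoids brace-initializing a float array from unsigned built-ins, a narrowing conversion that the HIP toolchain presumably rejects. An illustrative device helper in the spirit of the HIP path:

#include <cstdint>

__device__ inline void LoadLaunchCoordinates(int64_t (&coordinate)[6]) {
  // Gather the six launch coordinates as exact integers.
  coordinate[0] = threadIdx.x;
  coordinate[1] = threadIdx.y;
  coordinate[2] = threadIdx.z;
  coordinate[3] = blockIdx.x;
  coordinate[4] = blockIdx.y;
  coordinate[5] = blockIdx.z;
}
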
(Diffs for the remaining changed files are not shown in this view.)