Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ROCM] fix rocm compile bugs #59775

Merged
merged 5 commits into from
Dec 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions paddle/fluid/framework/new_executor/program_interpreter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ ProgramInterpreter::ProgramInterpreter(const platform::Place& place,

PrepareForCUDAGraphCapture();

#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
calculate_stream_timer_ = std::make_unique<phi::CalculateStreamTimer>(place);
#endif
}
Expand Down Expand Up @@ -659,7 +659,7 @@ void ProgramInterpreter::ClearLoDTensorArrayInLocalScope() {

std::tuple<double, double> ProgramInterpreter::InterpreterRunTime() {
double start_time = 0, end_time = 0;
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
start_time = calculate_stream_timer_->StartTime();
end_time = calculate_stream_timer_->EndTime();
#endif
Expand Down Expand Up @@ -1105,7 +1105,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {

try {
instr_node.WaitEvent(place_);
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
if (!calculate_stream_timer_->IsStarted() && op->Type() != "feed" &&
!interpreter::IsCommunicationOp(instr_node)) {
Expand All @@ -1124,7 +1124,7 @@ void ProgramInterpreter::RunInstruction(const Instruction& instr_node) {
}

instr_node.RecordEvent(place_);
#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (enable_job_schedule_profiler_) {
if (instr_node.Id() == last_calculate_instr_id_ &&
calculate_stream_timer_->IsStarted()) {
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/framework/new_executor/program_interpreter.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@

#include "paddle/fluid/framework/new_executor/interpreter_base_impl.h"

#if defined(PADDLE_WITH_CUDA)
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#endif

Expand Down
17 changes: 16 additions & 1 deletion paddle/phi/core/distributed/nccl_comm_task.cc
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ void NCCLCommTask::EndRecord() {
#endif
}

#ifdef PADDLE_WITH_CUDA
void NCCLCommTask::ClearRecord() {
if (start_event_created_) {
backends::gpu::GPUDeviceGuard guard(place_.device);
Expand All @@ -102,8 +103,22 @@ void NCCLCommTask::ClearRecord() {
end_event_created_ = false;
}
}
#else // PADDLE_WITH_HIP
void NCCLCommTask::ClearRecord() {
if (start_event_created_) {
backends::gpu::GPUDeviceGuard guard(place_.device);
HIP_CHECK(hipEventDestroy(nccl_start_event_));
start_event_created_ = false;
}
if (end_event_created_) {
backends::gpu::GPUDeviceGuard guard(place_.device);
HIP_CHECK(hipEventDestroy(nccl_end_event_));
end_event_created_ = false;
}
}
#endif

bool NCCLCommTask::CudaEventQuery(cudaEvent_t event) {
bool NCCLCommTask::CudaEventQuery(gpuEvent_t event) {
#ifdef PADDLE_WITH_CUDA
cudaError_t ret = cudaEventQuery(event);
if (ret == cudaSuccess) {
Expand Down
2 changes: 1 addition & 1 deletion paddle/phi/core/visit_type.h
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ namespace phi {

///////// BOOL and Floating and Integral Dispatch Marco ///////////

#if NCCL_VERSION_CODE >= 21000
#if (NCCL_VERSION_CODE >= 21000) && !defined(PADDLE_WITH_RCCL)
#define PD_VISIT_BOOL_AND_FLOATING_AND_INTEGRAL_TYPES_GPU(TYPE, NAME, ...) \
[&] { \
const auto& __dtype__ = TYPE; \
Expand Down
12 changes: 12 additions & 0 deletions paddle/phi/kernels/autotune/gpu_timer.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,17 @@

namespace phi {

#ifdef PADDLE_WITH_HIP
static void RecordEventTimerCallback(hipStream_t stream,
hipError_t status,
void *user_data) {
struct timeval time_now {};
gettimeofday(&time_now, nullptr);
double *cpu_time = static_cast<double *>(user_data);
*cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
}
#else
static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
cudaError_t status,
void *user_data) {
Expand All @@ -41,6 +52,7 @@ static void CUDART_CB RecordEventTimerCallback(cudaStream_t stream,
*cpu_time = (time_now.tv_sec * 1000) + (time_now.tv_usec / 1000.0);
VLOG(3) << "RecordEventCallback: " << std::to_string(*cpu_time);
}
#endif

class GpuTimer {
public:
Expand Down
15 changes: 14 additions & 1 deletion paddle/phi/kernels/funcs/dropout_impl.cu.h
Original file line number Diff line number Diff line change
Expand Up @@ -349,7 +349,19 @@ void DropoutFwGPUKernelDriver(
} else {
bool copy_in_kernel = GetSeedDataAndIncrement(
dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);

#ifdef PADDLE_WITH_HIP
VectorizedRandomGenerator<T>
<<<grid_size, block_size, 0, stream>>>(0,
size,
seed_data,
dropout_prob,
x_data,
mask_data,
y_data,
upscale_in_train,
increment,
main_offset);
#else
void* functionPtr =
reinterpret_cast<void*>(&(VectorizedRandomGenerator<T>));
cudaFunction_t cudaFunc;
Expand Down Expand Up @@ -401,6 +413,7 @@ void DropoutFwGPUKernelDriver(

VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
<< ", increment = " << increment;
#endif
}
} else {
if (upscale_in_train) {
Expand Down
2 changes: 2 additions & 0 deletions paddle/phi/kernels/funcs/fc_functor.cu
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ template class FCFunctor<GPUContext, float16>;
template class FCFunctor<GPUContext, float>;
template class FCFunctor<GPUContext, double>;

#ifndef PADDLE_WITH_HIP
template <typename DeviceContext, typename T>
void FCInt8Functor<DeviceContext, T>::operator()(
const DeviceContext& context,
Expand Down Expand Up @@ -454,5 +455,6 @@ void FCInt8Functor<DeviceContext, T>::operator()(
template class FCInt8Functor<GPUContext, float16>;
template class FCInt8Functor<GPUContext, float>;
template class FCInt8Functor<GPUContext, double>;
#endif
} // namespace funcs
} // namespace phi
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,19 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PADDLE_WITH_HIP
#include <cuda_fp16.h>

#include <cub/cub.cuh>
#endif

#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
#endif

namespace phi {
namespace fusion {
Expand Down Expand Up @@ -49,6 +51,7 @@ void FusedBiasDropoutResidualLnGradKernel(
DenseTensor* bias_grad,
DenseTensor* ln_scale_grad,
DenseTensor* ln_bias_grad) {
#ifndef PADDLE_WITH_HIP
using U = LayerNormParamType<T>;
auto* d_y_data = y_grad.data<T>();
auto* ln_scale_data =
Expand Down Expand Up @@ -111,6 +114,10 @@ void FusedBiasDropoutResidualLnGradKernel(
d_x_data,
d_bias_data,
d_residual_data);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"FusedBiasDropoutResidualLnGradKernel not surpport for rocm"));
#endif
}

} // namespace fusion
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,9 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/layer_norm_impl.cu.h"
#ifndef PADDLE_WITH_HIP
#include "paddle/phi/kernels/fusion/gpu/fused_dropout_helper.h"
#endif

namespace phi {
namespace fusion {
Expand All @@ -40,6 +42,7 @@ void FusedBiasDropoutResidualLnKernel(
DenseTensor* dropout_mask_out,
DenseTensor* ln_mean,
DenseTensor* ln_variance) {
#ifndef PADDLE_WITH_HIP
using U = phi::funcs::LayerNormParamType<T>;
auto* x_data = x.data<T>();
auto* bias_data = (bias.get_ptr() == nullptr) ? nullptr : bias->data<T>();
Expand Down Expand Up @@ -92,6 +95,10 @@ void FusedBiasDropoutResidualLnKernel(
y_data,
ln_mean_data,
ln_var_data);
#else
PADDLE_THROW(phi::errors::Unimplemented(
"FusedBiasDropoutResidualLnKernel not surpport for rocm"));
#endif
}
} // namespace fusion
} // namespace phi
Expand Down
2 changes: 2 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_conv2d_add_act.cu
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.

#ifdef PADDLE_WITH_CUDA
#include <xxhash.h>

#include <algorithm>
Expand Down Expand Up @@ -649,3 +650,4 @@ PD_REGISTER_KERNEL(fused_conv2d_add_act, // cuda_only
float,
double,
phi::dtype::float16) {}
#endif
13 changes: 13 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_dropout_add_grad_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,18 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,
? NoMaskBwFunctor<T, float>(1.0f - dropout_rate)
: NoMaskBwFunctor<T, float>(1.0f - dropout_rate, 1.0f);

#ifdef PADDLE_WITH_HIP
VectorizedDropoutBackward<T, NoMaskBwFunctor<T, float>>
<<<grid_size, block_size, 0, stream>>>(0,
numel,
seed_data, // idx: 2 need save
x_grad_data,
y_grad_data,
out_grad_data,
increment, // idx: 6 need save
main_offset,
functor);
#else
// we assume seed/offset is same across iterations
// seed_offset_data should preserved by cudaGraph pool
const phi::GPUContext* dev_ctx_p = &dev_ctx;
Expand Down Expand Up @@ -239,6 +251,7 @@ void FusedDropoutAddGradKernel(const Context& dev_ctx,

VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
<< ", increment = " << increment;
#endif
}
}

Expand Down
13 changes: 13 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_dropout_add_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,18 @@ void FusedDropoutAddKernel(const Context& dev_ctx,
auto dst_functor =
NoMaskFwFunctor<T, float>(1.0f - dropout_rate, upscale_in_train);

#ifdef PADDLE_WITH_HIP
VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>
<<<grid_size, block_size, 0, stream>>>(0,
numel,
seed_data, // need save
x_data,
y_data,
out_data,
increment, // need save
main_offset,
dst_functor);
#else
void* functionPtr = reinterpret_cast<void*>(
&(VectorizedDropoutForward<T, NoMaskFwFunctor<T, float>>));
cudaFunction_t cudaFunc;
Expand Down Expand Up @@ -241,6 +253,7 @@ void FusedDropoutAddKernel(const Context& dev_ctx,

VLOG(10) << "NON_CUDA_GRAPH seed = " << seed_data
<< ", increment = " << increment;
#endif
} else {
using MT = typename phi::dtype::MPTypeTrait<T>::Type;
MT factor = static_cast<MT>(1.0f - dropout_rate);
Expand Down
7 changes: 2 additions & 5 deletions paddle/phi/kernels/fusion/gpu/skip_layernorm_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,9 +85,6 @@ PD_REGISTER_KERNEL(skip_layernorm,
float,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(skip_layernorm,
GPU,
ALL_LAYOUT,
phi::fusion::SkipLayerNormKernel,
float){} {}
PD_REGISTER_KERNEL(
skip_layernorm, GPU, ALL_LAYOUT, phi::fusion::SkipLayerNormKernel, float) {}
#endif
27 changes: 27 additions & 0 deletions paddle/phi/kernels/gpu/strided_copy_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,12 +32,21 @@ __global__ void StridedCopyCaseZeroFunc(
phi::Array<int64_t, phi::DDim::kMaxRank + 1> output_stride) {
int64_t input_offset = 0;
int64_t output_offset = 0;
#ifdef PADDLE_WITH_HIP
int64_t coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#else
float coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#endif

#pragma unroll
for (int dim = RANK - 1; dim >= 0; --dim) {
Expand Down Expand Up @@ -458,12 +467,21 @@ __global__ void Strided2ContiguousCaseZeroFunc(
blockDim.z * blockDim.y * blockDim.x +
threadIdx.z * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
#ifdef PADDLE_WITH_HIP
int64_t coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#else
float coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#endif

#pragma unroll
for (int dim = RANK - 1; dim >= 0; --dim) {
Expand Down Expand Up @@ -863,12 +881,21 @@ __global__ void Contiguous2StridedCaseZeroFunc(
threadIdx.z * blockDim.y * blockDim.x +
threadIdx.y * blockDim.x + threadIdx.x;
int64_t output_offset = 0;
#ifdef PADDLE_WITH_HIP
int64_t coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#else
float coordinate[6] = {threadIdx.x,
threadIdx.y,
threadIdx.z,
blockIdx.x,
blockIdx.y,
blockIdx.z};
#endif

#pragma unroll
for (int dim = RANK - 1; dim >= 0; --dim) {
Expand Down
Loading