Commit
Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP (PaddlePaddle#40560)
* Replace PADDLE_WITH_XPU2 with PADDLE_WITH_KP
AnnaTrainingG authored and liqitong-a committed Mar 17, 2022
1 parent dfa7ee5 commit c1c4288
Showing 3 changed files with 75 additions and 135 deletions.
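The gist of the patch is visible in the launch sites below: instead of duplicating every kernel launch under an XPU-specific macro, the grid/block configuration is chosen once under PADDLE_WITH_XPU_KP and a single launch statement is shared by all backends. The standalone C++ sketch that follows is not part of the patch (LaunchConfig and PickLaunchConfig are illustrative names); it only shows the shape of that pattern, using the fixed 8 x 64 launch the diff hard-codes for the XPU KP build:

#include <cstdio>

// Stand-in for the grid/block pair that ReduceConfig computes for GPU builds.
struct LaunchConfig {
  int grid;
  int block;
};

// One place decides the launch shape; the kernel launch itself is no longer
// duplicated per backend. Compile with -DPADDLE_WITH_XPU_KP to take the KP branch.
LaunchConfig PickLaunchConfig(const LaunchConfig& computed) {
#ifdef PADDLE_WITH_XPU_KP
  return {8, 64};   // fixed cluster/core layout used by the KP launches in the diff
#else
  return computed;  // CUDA/HIP keep the configuration computed at runtime
#endif
}

int main() {
  LaunchConfig computed{128, 256};  // hypothetical stand-in for config.grid / config.block
  LaunchConfig chosen = PickLaunchConfig(computed);
  std::printf("launching with grid=%d block=%d\n", chosen.grid, chosen.block);
  return 0;
}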
192 changes: 57 additions & 135 deletions paddle/phi/kernels/funcs/reduce_function.h
@@ -14,8 +14,8 @@

#pragma once

-// CUDA and HIP use same api
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// CUDA, XPU and HIP use same api
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || defined(__xpu__)

#include <algorithm>
#include <cmath>
@@ -220,7 +220,7 @@ struct IndexCalculator {
phi::Array<int, kMaxRank> dims;
phi::Array<int, kMaxRank> strides;
phi::Array<int, kMaxRank> reduce_strides;
-#ifndef PADDLE_WITH_XPU2
+#ifndef PADDLE_WITH_XPU_KP
phi::Array<paddle::platform::FastDivMod, kMaxRank> divmoders;
#endif
};
@@ -231,81 +231,65 @@ struct ReduceIndexMapping {
HOSTDEVICE explicit ReduceIndexMapping(const kps::DimConfig& dims)
: dim(dims) {}

+#ifdef PADDLE_WITH_XPU_KP
__device__ __forceinline__ int BlockIdX() {
-#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return (cluster_id() / dim.split_num_x % dim.split_num_y);
} else {
return cluster_id() % dim.split_num_x;
}
-#else
-return blockIdx.x;
-#endif
}

__device__ __forceinline__ int BlockIdY() {
-#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return (cluster_id() % dim.split_num_x);
} else {
return (cluster_id() / dim.split_num_x % dim.split_num_y);
}
-#else
-return blockIdx.y;
-#endif
}

-__device__ __forceinline__ int BlockDimX() {
-#ifdef PADDLE_WITH_XPU2
-return dim.deal_size_x;
-#else
-return blockDim.x;
-#endif
-}
+__device__ __forceinline__ int BlockDimX() { return dim.deal_size_x; }

-__device__ __forceinline__ int BlockDimY() {
-#ifdef PADDLE_WITH_XPU2
-return 1;
-#else
-return blockDim.y;
-#endif
-}
+__device__ __forceinline__ int BlockDimY() { return 1; }

__device__ __forceinline__ int GridDimX() {
-#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.split_num_y;
} else {
return dim.split_num_x;
}
-#else
-return gridDim.x;
-#endif
}

__device__ __forceinline__ int GridDimY() {
-#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.split_num_x;
} else {
return dim.split_num_y;
}
-#else
-return gridDim.y;
-#endif
}

__device__ __forceinline__ int GetLoopSize() {
-#ifdef PADDLE_WITH_XPU2
if (ReduceLastDim) {
return dim.deal_size_y;
} else {
return dim.deal_size_x;
}
+}
#else
-return 1;
+__device__ __forceinline__ int BlockIdX() { return blockIdx.x; }
+
+__device__ __forceinline__ int BlockIdY() { return blockIdx.y; }
+
+__device__ __forceinline__ int BlockDimX() { return blockDim.x; }
+
+__device__ __forceinline__ int BlockDimY() { return blockDim.y; }
+
+__device__ __forceinline__ int GridDimX() { return gridDim.x; }
+
+__device__ __forceinline__ int GridDimY() { return gridDim.y; }
+
+__device__ int GetLoopSize() { return 1; }
#endif
-}
};

// when reduce_type == kReduceLastDim this struct will be used
@@ -341,7 +325,7 @@ struct ReduceConfig {

// when should_reduce_again is true, we need malloc temp space for temp data
void SetOutputData(Ty* y_data,
-const phi::GPUContext& dev_ctx,
+const KPDevice& dev_ctx,
phi::DenseTensor* tmp) {
if (should_reduce_again) {
tmp->Resize(phi::make_ddim(
@@ -640,9 +624,7 @@ struct ReduceConfig {
int blocking_size;
bool should_reduce_again;
bool reduce_last_dim;
-
Ty* output_data;
-
dim3 block;
dim3 grid;
};
@@ -770,9 +752,10 @@ __global__ void ReduceAnyKernel(const Tx* x,

kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
&reduce_var, &reduce_var, reducer, reduce_last_dim);
-if (need_store) {
-y[store_offset + i] = static_cast<Ty>(reduce_var);
-}
+
+Ty result = static_cast<Ty>(reduce_var);
+kps::details::WriteData<Ty>(
+y + store_offset + i, &result, static_cast<int>(need_store));
}
}

@@ -882,30 +865,18 @@ static void LaunchReduceKernel(const Tx* x_data,
dim.SetRem(config.reduce_num % config.block.x, 0, 0);

#ifdef PADDLE_WITH_XPU_KP
-ReduceAnyKernel<Tx,
-Ty,
-MPType,
-ReduceOp,
-TransformOp,
-OneDimIndexCal><<<8, 64, 0, stream>>>(
-x_data,
-config.output_data,
-reducer,
-transform,
-init,
-config.reduce_num,
-config.left_num,
-config.reduce_last_dim,
-reduce_index_calculator,
-left_index_calculator,
-dim);
+auto grid_num = 8;
+auto block_num = 64;
#else
+auto grid_num = config.grid;
+auto block_num = config.block;
+#endif
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
-OneDimIndexCal><<<config.grid, config.block, 0, stream>>>(
+OneDimIndexCal><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
@@ -917,7 +888,6 @@ static void LaunchReduceKernel(const Tx* x_data,
reduce_index_calculator,
left_index_calculator,
dim);
-#endif

} else {
int reduce_rank = config.reduce_strides.size();
@@ -938,30 +908,18 @@ static void LaunchReduceKernel(const Tx* x_data,
dim.SetRem(config.reduce_num % config.block.x, 0, 0);

#ifdef PADDLE_WITH_XPU_KP
-ReduceAnyKernel<Tx,
-Ty,
-MPType,
-ReduceOp,
-TransformOp,
-IndexCalculator><<<8, 64, 0, stream>>>(
-x_data,
-config.output_data,
-reducer,
-transform,
-init,
-config.reduce_num,
-config.left_num,
-config.reduce_last_dim,
-reduce_index_calculator,
-left_index_calculator,
-dim);
+auto grid_num = 8;
+auto block_num = 64;
#else
+auto grid_num = config.grid;
+auto block_num = config.block;
+#endif
ReduceAnyKernel<Tx,
Ty,
MPType,
ReduceOp,
TransformOp,
-IndexCalculator><<<config.grid, config.block, 0, stream>>>(
+IndexCalculator><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
@@ -973,7 +931,6 @@ static void LaunchReduceKernel(const Tx* x_data,
reduce_index_calculator,
left_index_calculator,
dim);
-#endif
}

if (config.should_reduce_again) {
@@ -993,22 +950,9 @@ static void LaunchReduceKernel(const Tx* x_data,
kps::DimConfig(grid.x, grid.y, grid.z, block.x, config.grid.y, 0);
dim.SetRem(config.left_num % block.x, 0, 0);
#ifdef PADDLE_WITH_XPU_KP
-ReduceHigherDimKernel<
-Ty,
-Ty,
-MPType,
-ReduceOp,
-kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-config.output_data,
-y_data,
-reducer,
-kps::IdentityFunctor<Ty, MPType>(),
-init,
-config.grid.y,
-config.left_num,
-config.grid.y,
-dim);
-#else
+grid = 8;
+block = 64;
+#endif
ReduceHigherDimKernel<
Ty,
Ty,
@@ -1024,7 +968,6 @@ static void LaunchReduceKernel(const Tx* x_data,
config.left_num,
config.grid.y,
dim);
-#endif
}
}

@@ -1038,7 +981,7 @@ CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
-const phi::GPUContext& dev_ctx,
+const KPDevice& dev_ctx,
KPStream stream) {
auto reducer = ReduceOp<Ty>();
cub::TransformInputIterator<Ty, TransformOp, const Tx*> trans_x(x_data,
@@ -1077,7 +1020,7 @@ CubTensorReduceImpl(const Tx* x_data,
Ty* y_data,
const TransformOp& transform,
int reduce_num,
-const phi::GPUContext& dev_ctx,
+const KPDevice& dev_ctx,
KPStream stream) {
PADDLE_THROW(phi::errors::InvalidArgument(
"Tx should not be float16 when using cub::DeviceReduce::Reduce()."));
@@ -1087,12 +1030,16 @@ template <typename Tx,
typename Ty,
template <typename> class ReduceOp,
typename TransformOp>
-void ReduceKernel(const phi::GPUContext& dev_ctx,
+void ReduceKernel(const KPDevice& dev_ctx,
const phi::DenseTensor& x,
phi::DenseTensor* y,
const TransformOp& transform,
const std::vector<int>& origin_reduce_dims) {
+#ifdef PADDLE_WITH_XPU_KP
+auto stream = dev_ctx.x_context()->xpu_stream;
+#else
auto stream = dev_ctx.stream();
+#endif
dev_ctx.Alloc<Ty>(y);

auto x_dim = phi::vectorize<int>(x.dims());
@@ -1149,11 +1096,17 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
0);

#ifdef PADDLE_WITH_XPU_KP
+auto grid_num = 8;
+auto block_num = 64;
+#else
+auto grid_num = config.grid;
+auto block_num = config.block;
+#endif
ReduceHigherDimKernel<Tx,
Ty,
MPType,
ReduceOp<MPType>,
-TransformOp><<<8, 64, 0, stream>>>(
+TransformOp><<<grid_num, block_num, 0, stream>>>(
x_data,
config.output_data,
reducer,
@@ -1163,23 +1116,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
config.left_num,
config.blocking_size,
dim);
-#else
-ReduceHigherDimKernel<
-Tx,
-Ty,
-MPType,
-ReduceOp<MPType>,
-TransformOp><<<config.grid, config.block, 0, stream>>>(
-x_data,
-config.output_data,
-reducer,
-transform,
-reducer.initial(),
-config.reduce_num,
-config.left_num,
-config.blocking_size,
-dim);
-#endif

if (config.should_reduce_again) {
dim3 block = dim3(config.block.x, 1, 1);
@@ -1189,22 +1125,9 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
dim2.SetRem(config.left_num % config.block.x, 0, 0);

#ifdef PADDLE_WITH_XPU_KP
-ReduceHigherDimKernel<
-Ty,
-Ty,
-MPType,
-ReduceOp<MPType>,
-kps::IdentityFunctor<Ty, MPType>><<<8, 64, 0, stream>>>(
-config.output_data,
-y_data,
-reducer,
-kps::IdentityFunctor<Ty, MPType>(config.grid.y),
-reducer.initial(),
-config.grid.y,
-config.left_num,
-config.grid.y,
-dim2);
-#else
+grid = 8;
+block = 64;
+#endif
ReduceHigherDimKernel<
Ty,
Ty,
@@ -1220,7 +1143,6 @@ void ReduceKernel(const phi::GPUContext& dev_ctx,
config.left_num,
config.grid.y,
dim2);
-#endif
}
return;
}
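One smaller change in ReduceAnyKernel above is worth noting: the explicit if (need_store) branch around the final store is replaced by a call to kps::details::WriteData, which takes the store predicate as an argument so the same code path compiles for both the GPU and XPU KP backends. Below is a minimal stand-alone sketch of that kind of guarded-write helper; GuardedWrite is a hypothetical name and not Paddle's implementation:

#include <cstdio>

// Writes *value to dst only when the predicate is non-zero, so the caller
// no longer needs a branch at the call site.
template <typename T>
void GuardedWrite(T* dst, const T* value, int need_store) {
  if (need_store) {
    *dst = *value;
  }
}

int main() {
  float y[2] = {0.f, 0.f};
  float result = 3.5f;
  GuardedWrite(y + 0, &result, 1);  // stored
  GuardedWrite(y + 1, &result, 0);  // skipped
  std::printf("y = {%g, %g}\n", y[0], y[1]);
  return 0;
}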