[pnorm] fix bug in fp16 & optimize memory (PaddlePaddle#39011)

LemonNoel authored Jan 25, 2022
1 parent c1e5a39 commit 3825b40
Showing 5 changed files with 123 additions and 97 deletions.
92 changes: 26 additions & 66 deletions paddle/fluid/operators/p_norm_op.cu
@@ -76,22 +76,13 @@ struct AbsFunctor {
}
};

template <typename Tx, typename Ty = Tx>
template <typename T>
struct UnsignedPowFunctor {
HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) {
this->porder = porder;
}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(inline_pow(inline_abs(x), static_cast<Tx>(porder)));
}
float porder;
};

template <typename Tx, typename Ty = Tx>
struct PowFunctor {
HOSTDEVICE explicit inline PowFunctor(float porder) { this->porder = porder; }
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(inline_pow(x, static_cast<Tx>(porder)));
HOSTDEVICE inline T operator()(const T x) const {
return static_cast<T>(inline_pow(inline_abs(x), static_cast<T>(porder)));
}
float porder;
};
@@ -105,13 +96,11 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
const T* x = in_x->data<T>();
T* norm = out_norm->mutable_data<T>(ctx.GetPlace());
auto xdim = in_x->dims();
auto ndim = out_norm->dims();
float porder = ctx.Attr<float>("porder");
bool asvector = ctx.Attr<bool>("asvector");
int axis = ctx.Attr<int>("axis");
std::vector<int> reduce_axis = {axis};
reduce_axis = GetReduceDim(reduce_axis, xdim.size(), asvector);

auto stream = ctx.cuda_device_context().stream();

using MT = typename details::MPTypeTrait<T>::Type;
@@ -125,29 +114,17 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
} else {
framework::Tensor tmp_x;
tmp_x.mutable_data<T>(xdim, ctx.GetPlace());
std::vector<const framework::Tensor*> ins = {in_x};
std::vector<framework::Tensor*> outs = {&tmp_x};
auto func = UnsignedPowFunctor<MT, T>(porder);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);

const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm};
std::vector<framework::Tensor*> outs = {out_norm};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
cuda_ctx, ins, &outs, func);
framework::Tensor tmp_y;
tmp_y.mutable_data<T>(ndim, ctx.GetPlace());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
tmp_x, &tmp_y, kps::IdentityFunctor<T>(), reduce_axis, stream);
const framework::Tensor* tmp_norm = &tmp_y;
ins = {tmp_norm};
outs = {out_norm};
auto func_inverse = UnsignedPowFunctor<MT, T>(1. / porder);

paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
cuda_ctx, ins, &outs, func_inverse);
ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
cuda_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
}
}
};
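The forward-path change above removes the elementwise |x|^p pass and its full-size temporary tensor: the power is now fused into the reduction via UnsignedPowFunctor, and only a single elementwise pow(., 1/porder) runs on the already-reduced output. A minimal CPU sketch of that fused pattern, illustrative only and not the Paddle kernel (the function name and signature here are assumptions):

#include <cmath>
#include <cstddef>

// Fused transform-reduce: accumulate |x_i|^p inside the reduction itself,
// then apply the 1/p power once on the reduced value, instead of first
// writing |x_i|^p into a temporary buffer the size of the input.
float PNormFused(const float* x, std::size_t n, float porder) {
  float sum = 0.f;
  for (std::size_t i = 0; i < n; ++i) {
    sum += std::pow(std::fabs(x[i]), porder);
  }
  return std::pow(sum, 1.f / porder);
}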
@@ -158,29 +135,25 @@ struct AbsMaxAndMinGradFunctor {
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto equals = ((*x).abs() == y->broadcast(dim));
auto ones = dx->constant(static_cast<T>(1.));
auto negs = dx->constant(static_cast<T>(-1.));
auto zeros = dx->constant(static_cast<T>(0.));
auto positives = (*x) > zeros;
dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros) *
positives.select(ones, negs);
dx->device(place) = dy->broadcast(dim) * (*x).sign() *
((*x).abs() == y->broadcast(dim)).template cast<T>();
}
};

template <typename T>
struct PNormPostGradFunctor {
struct PNormGradFunctor {
HOSTDEVICE explicit inline PNormGradFunctor(float porder) {
this->porder = static_cast<T>(porder - 1.);
}
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto ones = dx->constant(static_cast<T>(1.));
auto negs = dx->constant(static_cast<T>(-1.));
auto zeros = dx->constant(static_cast<T>(0.));
auto positives = (*x) > zeros;
dx->device(place) = (*dx) * dy->broadcast(dim) * y->broadcast(dim) *
positives.select(ones, negs);
dx->device(place) = (*x).abs().pow(this->porder) * (*x).sign() *
dy->broadcast(dim) *
(*y).pow(-this->porder).broadcast(dim);
}
T porder;
};

template <typename DeviceContext, typename T, typename AttrType = T>
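For reference, the math the rewritten gradient functors above implement, assuming y = ‖x‖_p and dy the upstream gradient (this derivation is added here for clarity; it is not part of the commit):

\[
y = \Bigl(\sum_i |x_i|^p\Bigr)^{1/p}, \qquad
\frac{\partial y}{\partial x_i} = |x_i|^{\,p-1}\,\operatorname{sign}(x_i)\,y^{\,1-p},
\]
so
\[
dx_i = dy \cdot |x_i|^{\,p-1}\,\operatorname{sign}(x_i)\,y^{-(p-1)},
\]
which is the expression computed by PNormGradFunctor with this->porder = p - 1. For the infinity (and negative-infinity) norm, the gradient simply routes dy to the entries attaining the extreme absolute value:
\[
dx_i = dy \cdot \operatorname{sign}(x_i)\,\mathbf{1}\bigl[\,|x_i| = y\,\bigr],
\]
which matches the sign-times-equality-mask form in AbsMaxAndMinGradFunctor.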
@@ -207,26 +180,13 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
math::SetConstant<DeviceContext, T> set_zero;
set_zero(cuda_ctx, out_dx, static_cast<T>(0));
} else if (porder == INFINITY || porder == -INFINITY) {
AbsMaxAndMinGradFunctor<T> functor;
LaunchReduceGradKernel<DeviceContext, T, AbsMaxAndMinGradFunctor<T>>(
ctx, in_x, in_norm, in_norm_dy, out_dx, dims, reduce_all);
ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all);
} else {
framework::Tensor tmp_norm;
tmp_norm.mutable_data<T>(in_norm->dims(), ctx.GetPlace());
std::vector<const framework::Tensor*> ins = {in_norm};
std::vector<framework::Tensor*> outs = {&tmp_norm};
auto pow_functor = PowFunctor<T>(1. - porder);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T, PowFunctor<T>>(cuda_ctx, ins, &outs,
pow_functor);
ins = {in_x};
outs = {out_dx};
auto unsigned_pow = UnsignedPowFunctor<T>(porder - 1.);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
cuda_ctx, ins, &outs, unsigned_pow);
const framework::Tensor* tmp_norm_const = &tmp_norm;
LaunchReduceGradKernel<DeviceContext, T, PNormPostGradFunctor<T>>(
ctx, in_x, tmp_norm_const, in_norm_dy, out_dx, dims, reduce_all);
auto functor = PNormGradFunctor<T>(porder);
LaunchReduceGradKernel<DeviceContext, T, PNormGradFunctor<T>>(
ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all);
}
}
};
9 changes: 5 additions & 4 deletions paddle/fluid/operators/reduce_ops/logsumexp_op.h
@@ -139,26 +139,27 @@ class LogsumexpGradKernel : public framework::OpKernel<T> {
broadcast_dim[0]);
} else {
int rank = input->dims().size();
LogsumexpGradFunctor functor;
switch (rank) {
case 1:
ReduceGradFunctor<DeviceContext, T, 1, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
case 2:
ReduceGradFunctor<DeviceContext, T, 2, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
case 3:
ReduceGradFunctor<DeviceContext, T, 3, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
case 4:
ReduceGradFunctor<DeviceContext, T, 4, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
}
}
29 changes: 15 additions & 14 deletions paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -143,7 +143,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context,
const framework::Tensor* x,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
const std::vector<int>& dims) {
Functor functor, const std::vector<int>& dims) {
const int64_t unreduced = out->numel();
const int64_t reduced = x->numel() / unreduced;
DDim out_dim(out->dims());
@@ -157,7 +157,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context,
dx->Resize({unreduced, reduced});
ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
dx, {1});
dx, functor, {1});
// transpose dX
std::vector<int> origin_axis(x_dim.size());
GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
@@ -333,7 +333,7 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
const framework::Tensor* input0,
const framework::Tensor* input1,
const framework::Tensor* input2,
paddle::framework::Tensor* output,
paddle::framework::Tensor* output, Functor functor,
const std::vector<int>& dims,
bool reduce_all = false) {
if (reduce_all) {
@@ -345,7 +345,6 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
*context.template device_context<DeviceContext>().eigen_device();
auto broadcast_dim =
Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broadcast_dim[0]);
} else {
@@ -354,36 +353,36 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
case 1:
ReduceGradFunctor<DeviceContext, T, 1, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 2:
ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 3:
ReduceGradFunctor<DeviceContext, T, 3, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 4:
ReduceGradFunctor<DeviceContext, T, 4, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 5:
ReduceGradFunctor<DeviceContext, T, 5, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 6:
ReduceGradFunctor<DeviceContext, T, 6, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
default:
HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
input2, output, dims);
HandleLargeDimGrad<DeviceContext, T, Functor>(
context, input0, input1, input2, output, functor, dims);
break;
}
}
@@ -430,8 +429,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
// NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and
// not be set as Input in grad Maker, use Out_grad to replace here
if (!input1) input1 = input2;
LaunchReduceGradKernel<DeviceContext, T, Functor>(
context, input0, input1, input2, output, const_dims, reduce_all);
Functor functor;
LaunchReduceGradKernel<DeviceContext, T, Functor>(context, input0, input1,
input2, output, functor,
const_dims, reduce_all);
}

void Compute(const framework::ExecutionContext& context) const override {
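The reduce_op.h changes above (and the matching logsumexp_op.h and reduce_op_function.h changes) stop default-constructing Functor inside the launchers and instead accept an instance from the caller, so stateful functors such as PNormGradFunctor(porder) can reuse the shared reduce-grad path. A minimal sketch of that pattern, with illustrative names that are not Paddle APIs:

#include <cstddef>
#include <vector>

// Stateful functor: carries a runtime parameter, so a generic launcher can
// no longer create it with a default constructor.
struct ScaleGradFunctor {
  explicit ScaleGradFunctor(float s) : scale(s) {}
  void operator()(const std::vector<float>& dy, std::vector<float>* dx) const {
    dx->resize(dy.size());
    for (std::size_t i = 0; i < dy.size(); ++i) (*dx)[i] = scale * dy[i];
  }
  float scale;
};

// Launcher takes the functor by value and forwards it, mirroring the new
// parameter added to LaunchReduceGradKernel / ReduceGradFunctor in this commit.
template <typename Functor>
void LaunchGrad(const std::vector<float>& dy, std::vector<float>* dx, Functor functor) {
  functor(dy, dx);
}

// Usage: LaunchGrad(dy, &dx, ScaleGradFunctor(porder - 1.f));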
3 changes: 1 addition & 2 deletions paddle/fluid/operators/reduce_ops/reduce_op_function.h
@@ -74,7 +74,7 @@ void ReduceGradFunctor(const DeviceContext& context,
const framework::Tensor& input0,
const framework::Tensor& input1,
const framework::Tensor& input2,
framework::Tensor* output,
framework::Tensor* output, Functor functor,
const std::vector<int>& dims) {
auto x = EigenTensor<T, D>::From(input0);
auto x_grad = EigenTensor<T, D>::From(*output);
@@ -100,7 +100,6 @@ void ReduceGradFunctor(const DeviceContext& context,

auto& place = *context.eigen_device();

Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broad_cats_times);
}