[pnorm] fix bug in fp16 & optimize memory (PaddlePaddle#39011)

LemonNoel authored Jan 25, 2022
1 parent c1e5a39 commit 3825b40
Showing 5 changed files with 123 additions and 97 deletions.
92 changes: 26 additions & 66 deletions paddle/fluid/operators/p_norm_op.cu
@@ -76,22 +76,13 @@ struct AbsFunctor {
}
};

template <typename Tx, typename Ty = Tx>
template <typename T>
struct UnsignedPowFunctor {
HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) {
this->porder = porder;
}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(inline_pow(inline_abs(x), static_cast<Tx>(porder)));
}
float porder;
};

template <typename Tx, typename Ty = Tx>
struct PowFunctor {
HOSTDEVICE explicit inline PowFunctor(float porder) { this->porder = porder; }
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(inline_pow(x, static_cast<Tx>(porder)));
HOSTDEVICE inline T operator()(const T x) const {
return static_cast<T>(inline_pow(inline_abs(x), static_cast<T>(porder)));
}
float porder;
};
@@ -105,13 +96,11 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
const T* x = in_x->data<T>();
T* norm = out_norm->mutable_data<T>(ctx.GetPlace());
auto xdim = in_x->dims();
auto ndim = out_norm->dims();
float porder = ctx.Attr<float>("porder");
bool asvector = ctx.Attr<bool>("asvector");
int axis = ctx.Attr<int>("axis");
std::vector<int> reduce_axis = {axis};
reduce_axis = GetReduceDim(reduce_axis, xdim.size(), asvector);

auto stream = ctx.cuda_device_context().stream();

using MT = typename details::MPTypeTrait<T>::Type;
@@ -125,29 +114,17 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
*in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
} else {
framework::Tensor tmp_x;
tmp_x.mutable_data<T>(xdim, ctx.GetPlace());
std::vector<const framework::Tensor*> ins = {in_x};
std::vector<framework::Tensor*> outs = {&tmp_x};
auto func = UnsignedPowFunctor<MT, T>(porder);
TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
*in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);

const framework::Tensor* tmp_norm = out_norm;
std::vector<const framework::Tensor*> ins = {tmp_norm};
std::vector<framework::Tensor*> outs = {out_norm};
const auto& cuda_ctx =
ctx.template device_context<platform::CUDADeviceContext>();

paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
cuda_ctx, ins, &outs, func);
framework::Tensor tmp_y;
tmp_y.mutable_data<T>(ndim, ctx.GetPlace());
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
tmp_x, &tmp_y, kps::IdentityFunctor<T>(), reduce_axis, stream);
const framework::Tensor* tmp_norm = &tmp_y;
ins = {tmp_norm};
outs = {out_norm};
auto func_inverse = UnsignedPowFunctor<MT, T>(1. / porder);

paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
cuda_ctx, ins, &outs, func_inverse);
ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
cuda_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
}
}
};
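The forward-path change above removes the elementwise |x|^p pass and its full-size temporary tensor: the power is now fused into the reduction via UnsignedPowFunctor, and only a single elementwise pow(., 1/porder) runs on the already-reduced output. A minimal CPU sketch of that fused pattern, illustrative only and not the Paddle kernel (the function name and signature here are assumptions):

#include <cmath>
#include <cstddef>

// Fused transform-reduce: accumulate |x_i|^p inside the reduction itself,
// then apply the 1/p power once on the reduced value, instead of first
// writing |x_i|^p into a temporary buffer the size of the input.
float PNormFused(const float* x, std::size_t n, float porder) {
  float sum = 0.f;
  for (std::size_t i = 0; i < n; ++i) {
    sum += std::pow(std::fabs(x[i]), porder);
  }
  return std::pow(sum, 1.f / porder);
}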
@@ -158,29 +135,25 @@ struct AbsMaxAndMinGradFunctor {
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto equals = ((*x).abs() == y->broadcast(dim));
auto ones = dx->constant(static_cast<T>(1.));
auto negs = dx->constant(static_cast<T>(-1.));
auto zeros = dx->constant(static_cast<T>(0.));
auto positives = (*x) > zeros;
dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros) *
positives.select(ones, negs);
dx->device(place) = dy->broadcast(dim) * (*x).sign() *
((*x).abs() == y->broadcast(dim)).template cast<T>();
}
};

template <typename T>
struct PNormPostGradFunctor {
struct PNormGradFunctor {
HOSTDEVICE explicit inline PNormGradFunctor(float porder) {
this->porder = static_cast<T>(porder - 1.);
}
template <typename DeviceContext, typename X, typename Y, typename DX,
typename DY, typename Dim>
void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
const Dim& dim, int size) {
auto ones = dx->constant(static_cast<T>(1.));
auto negs = dx->constant(static_cast<T>(-1.));
auto zeros = dx->constant(static_cast<T>(0.));
auto positives = (*x) > zeros;
dx->device(place) = (*dx) * dy->broadcast(dim) * y->broadcast(dim) *
positives.select(ones, negs);
dx->device(place) = (*x).abs().pow(this->porder) * (*x).sign() *
dy->broadcast(dim) *
(*y).pow(-this->porder).broadcast(dim);
}
T porder;
};

template <typename DeviceContext, typename T, typename AttrType = T>
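For reference, the math the rewritten gradient functors above implement, assuming y = ‖x‖_p and dy the upstream gradient (this derivation is added here for clarity; it is not part of the commit):

\[
y = \Bigl(\sum_i |x_i|^p\Bigr)^{1/p}, \qquad
\frac{\partial y}{\partial x_i} = |x_i|^{\,p-1}\,\operatorname{sign}(x_i)\,y^{\,1-p},
\]
so
\[
dx_i = dy \cdot |x_i|^{\,p-1}\,\operatorname{sign}(x_i)\,y^{-(p-1)},
\]
which is the expression computed by PNormGradFunctor with this->porder = p - 1. For the infinity (and negative-infinity) norm, the gradient simply routes dy to the entries attaining the extreme absolute value:
\[
dx_i = dy \cdot \operatorname{sign}(x_i)\,\mathbf{1}\bigl[\,|x_i| = y\,\bigr],
\]
which matches the sign-times-equality-mask form in AbsMaxAndMinGradFunctor.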
@@ -207,26 +180,13 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
math::SetConstant<DeviceContext, T> set_zero;
set_zero(cuda_ctx, out_dx, static_cast<T>(0));
} else if (porder == INFINITY || porder == -INFINITY) {
AbsMaxAndMinGradFunctor<T> functor;
LaunchReduceGradKernel<DeviceContext, T, AbsMaxAndMinGradFunctor<T>>(
ctx, in_x, in_norm, in_norm_dy, out_dx, dims, reduce_all);
ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all);
} else {
framework::Tensor tmp_norm;
tmp_norm.mutable_data<T>(in_norm->dims(), ctx.GetPlace());
std::vector<const framework::Tensor*> ins = {in_norm};
std::vector<framework::Tensor*> outs = {&tmp_norm};
auto pow_functor = PowFunctor<T>(1. - porder);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T, PowFunctor<T>>(cuda_ctx, ins, &outs,
pow_functor);
ins = {in_x};
outs = {out_dx};
auto unsigned_pow = UnsignedPowFunctor<T>(porder - 1.);
paddle::operators::LaunchSameDimsElementwiseCudaKernel<
ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
cuda_ctx, ins, &outs, unsigned_pow);
const framework::Tensor* tmp_norm_const = &tmp_norm;
LaunchReduceGradKernel<DeviceContext, T, PNormPostGradFunctor<T>>(
ctx, in_x, tmp_norm_const, in_norm_dy, out_dx, dims, reduce_all);
auto functor = PNormGradFunctor<T>(porder);
LaunchReduceGradKernel<DeviceContext, T, PNormGradFunctor<T>>(
ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all);
}
}
};
9 changes: 5 additions & 4 deletions paddle/fluid/operators/reduce_ops/logsumexp_op.h
@@ -139,26 +139,27 @@ class LogsumexpGradKernel : public framework::OpKernel<T> {
broadcast_dim[0]);
} else {
int rank = input->dims().size();
LogsumexpGradFunctor functor;
switch (rank) {
case 1:
ReduceGradFunctor<DeviceContext, T, 1, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
case 2:
ReduceGradFunctor<DeviceContext, T, 2, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
case 3:
ReduceGradFunctor<DeviceContext, T, 3, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
case 4:
ReduceGradFunctor<DeviceContext, T, 4, LogsumexpGradFunctor>(
context.template device_context<DeviceContext>(), *input, *output,
*output_grad, input_grad, axis);
*output_grad, input_grad, functor, axis);
break;
}
}
29 changes: 15 additions & 14 deletions paddle/fluid/operators/reduce_ops/reduce_op.h
@@ -143,7 +143,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context,
const framework::Tensor* x,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
const std::vector<int>& dims) {
Functor functor, const std::vector<int>& dims) {
const int64_t unreduced = out->numel();
const int64_t reduced = x->numel() / unreduced;
DDim out_dim(out->dims());
@@ -157,7 +157,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context,
dx->Resize({unreduced, reduced});
ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
dx, {1});
dx, functor, {1});
// transpose dX
std::vector<int> origin_axis(x_dim.size());
GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
@@ -333,7 +333,7 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
const framework::Tensor* input0,
const framework::Tensor* input1,
const framework::Tensor* input2,
paddle::framework::Tensor* output,
paddle::framework::Tensor* output, Functor functor,
const std::vector<int>& dims,
bool reduce_all = false) {
if (reduce_all) {
@@ -345,7 +345,6 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
*context.template device_context<DeviceContext>().eigen_device();
auto broadcast_dim =
Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broadcast_dim[0]);
} else {
@@ -354,36 +353,36 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
case 1:
ReduceGradFunctor<DeviceContext, T, 1, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 2:
ReduceGradFunctor<DeviceContext, T, 2, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 3:
ReduceGradFunctor<DeviceContext, T, 3, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 4:
ReduceGradFunctor<DeviceContext, T, 4, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 5:
ReduceGradFunctor<DeviceContext, T, 5, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
case 6:
ReduceGradFunctor<DeviceContext, T, 6, Functor>(
context.template device_context<DeviceContext>(), *input0, *input1,
*input2, output, dims);
*input2, output, functor, dims);
break;
default:
HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
input2, output, dims);
HandleLargeDimGrad<DeviceContext, T, Functor>(
context, input0, input1, input2, output, functor, dims);
break;
}
}
@@ -430,8 +429,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
// NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and
// not be set as Input in grad Maker, use Out_grad to replace here
if (!input1) input1 = input2;
LaunchReduceGradKernel<DeviceContext, T, Functor>(
context, input0, input1, input2, output, const_dims, reduce_all);
Functor functor;
LaunchReduceGradKernel<DeviceContext, T, Functor>(context, input0, input1,
input2, output, functor,
const_dims, reduce_all);
}

void Compute(const framework::ExecutionContext& context) const override {
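The reduce_op.h changes above (and the matching logsumexp_op.h and reduce_op_function.h changes) stop default-constructing Functor inside the launchers and instead accept an instance from the caller, so stateful functors such as PNormGradFunctor(porder) can reuse the shared reduce-grad path. A minimal sketch of that pattern, with illustrative names that are not Paddle APIs:

#include <cstddef>
#include <vector>

// Stateful functor: carries a runtime parameter, so a generic launcher can
// no longer create it with a default constructor.
struct ScaleGradFunctor {
  explicit ScaleGradFunctor(float s) : scale(s) {}
  void operator()(const std::vector<float>& dy, std::vector<float>* dx) const {
    dx->resize(dy.size());
    for (std::size_t i = 0; i < dy.size(); ++i) (*dx)[i] = scale * dy[i];
  }
  float scale;
};

// Launcher takes the functor by value and forwards it, mirroring the new
// parameter added to LaunchReduceGradKernel / ReduceGradFunctor in this commit.
template <typename Functor>
void LaunchGrad(const std::vector<float>& dy, std::vector<float>* dx, Functor functor) {
  functor(dy, dx);
}

// Usage: LaunchGrad(dy, &dx, ScaleGradFunctor(porder - 1.f));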
3 changes: 1 addition & 2 deletions paddle/fluid/operators/reduce_ops/reduce_op_function.h
@@ -74,7 +74,7 @@ void ReduceGradFunctor(const DeviceContext& context,
const framework::Tensor& input0,
const framework::Tensor& input1,
const framework::Tensor& input2,
framework::Tensor* output,
framework::Tensor* output, Functor functor,
const std::vector<int>& dims) {
auto x = EigenTensor<T, D>::From(input0);
auto x_grad = EigenTensor<T, D>::From(*output);
@@ -100,7 +100,6 @@ void ReduceGradFunctor(const DeviceContext& context,

auto& place = *context.eigen_device();

Functor functor;
functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
broad_cats_times);
}