Add fused_scale_bias_add_relu
Tom-Zheng committed Nov 9, 2023
1 parent 7b8b8fa commit 99c5d30
Showing 12 changed files with 433 additions and 3 deletions.
2 changes: 2 additions & 0 deletions paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -74,6 +74,7 @@
'fused_fc_elementwise_layernorm',
'fused_multi_transformer_xpu',
'fused_scale_bias_relu_conv_bn',
'fused_scale_bias_add_relu',
'fusion_transpose_flatten_concat',
'generate_sequence_xpu',
'layer_norm_act_xpu',
@@ -104,6 +105,7 @@
'fused_batch_norm_act_',
'fused_bn_add_activation_',
'fused_scale_bias_relu_conv_bn',
'fused_scale_bias_add_relu',
'memcpy',
'print',
'recv_v2',
10 changes: 10 additions & 0 deletions paddle/phi/api/yaml/fused_ops.yaml
@@ -206,6 +206,16 @@
backward: fused_rotary_position_embedding_grad
support_dygraph_mode : true

- op : fused_scale_bias_add_relu
args : (Tensor x1, Tensor scale1, Tensor bias1, Tensor x2, Tensor scale2, Tensor bias2, bool fuse_dual, bool exhaustive_search)
optional : scale2, bias2
output : Tensor(y)
infer_meta :
func : FusedScaleBiasAddReluInferMeta
kernel :
func : fused_scale_bias_add_relu
data_type : x1

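For reference, the graph built by the kernel below evaluates y = relu(scale1 * x1 + bias1 + branch2), where branch2 is scale2 * x2 + bias2 when fuse_dual is set and plain x2 otherwise. A minimal NumPy sketch of that reference computation (hypothetical helper, shapes assumed to broadcast per-channel):

import numpy as np

def fused_scale_bias_add_relu_ref(x1, scale1, bias1, x2,
                                  scale2=None, bias2=None, fuse_dual=False):
    # Per-channel scale/bias broadcast against the activation tensors.
    branch1 = x1 * scale1 + bias1
    branch2 = x2 * scale2 + bias2 if fuse_dual else x2
    return np.maximum(branch1 + branch2, 0.0)
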
- op : fused_scale_bias_relu_conv_bn
args : (Tensor x, Tensor w, Tensor scale, Tensor bias, Tensor bn_scale, Tensor bn_bias, Tensor input_running_mean, Tensor input_running_var, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, str data_format, float momentum, float epsilon, bool fuse_prologue, bool exhaustive_search, int64_t accumulation_count = 0)
optional : scale, bias
26 changes: 26 additions & 0 deletions paddle/phi/infermeta/fusion.cc
@@ -1820,6 +1820,32 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x,
eq_bias->set_dims(c_dims);
}

void FusedScaleBiasAddReluInferMeta(const MetaTensor& x1,
const MetaTensor& scale1,
const MetaTensor& bias1,
const MetaTensor& x2,
const MetaTensor& scale2,
const MetaTensor& bias2,
bool fuse_dual,
bool exhaustive_search,
MetaTensor* y) {
// check optional inputs
if (fuse_dual) {
bool has_scale2 = !!scale2;
bool has_bias2 = !!bias2;
PADDLE_ENFORCE(has_scale2 && has_bias2,
phi::errors::InvalidArgument(
"Argument scale2 and bias2 should be provided when "
"fuse_dual is set, but got has_scale2=%d, has_bias2=%d, "
"fuse_dual=%d.",
has_scale2,
has_bias2,
fuse_dual));
}
// set output dims
y->set_dims(x1.dims());
}

void SqueezeExcitationInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& filter_max,
10 changes: 10 additions & 0 deletions paddle/phi/infermeta/fusion.h
@@ -478,6 +478,16 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x,
MetaTensor* eq_scale,
MetaTensor* eq_bias);

void FusedScaleBiasAddReluInferMeta(const MetaTensor& x1,
const MetaTensor& scale1,
const MetaTensor& bias1,
const MetaTensor& x2,
const MetaTensor& scale2,
const MetaTensor& bias2,
bool fuse_dual,
bool exhaustive_search,
MetaTensor* y);

void SqueezeExcitationInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& filter_max,
3 changes: 2 additions & 1 deletion paddle/phi/kernels/CMakeLists.txt
@@ -108,7 +108,8 @@ endif()

if(NOT WITH_CUDNN_FRONTEND)
list(REMOVE_ITEM kernel_cu
"fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu")
"fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu"
"fusion/gpu/fused_scale_bias_add_relu_kernel.cu")
endif()

set(cc_search_pattern
3 changes: 3 additions & 0 deletions paddle/phi/kernels/autotune/cache.cc
@@ -52,6 +52,9 @@ std::string AlgorithmTypeString(int64_t algo_type) {
return "scale_bias_relu_conv_bnstats";
} else if (algo_type == static_cast<int64_t>(AlgorithmType::kBNFinalize)) {
return "bn_finalize";
} else if (algo_type ==
static_cast<int64_t>(AlgorithmType::kScaleBiasAddRelu)) {
return "scale_bias_add_relu";
}
#endif
return std::to_string(algo_type);
5 changes: 3 additions & 2 deletions paddle/phi/kernels/autotune/cache.h
@@ -57,7 +57,8 @@ enum class AlgorithmType {
kConvBackwardFilterV8 = 12,
kScaleBiasReluConvBNstats = 13,
kBNFinalize = 14,
kAlgorithmCount = 15
kScaleBiasAddRelu = 15,
kAlgorithmCount = 16
#endif
};

@@ -181,7 +182,7 @@ class AutoTuneCache {
}
#ifdef PADDLE_WITH_CUDNN_FRONTEND
} else if (algo_type >= AlgorithmType::kConvForwardV8 &&
algo_type <= AlgorithmType::kBNFinalize) {
algo_type < AlgorithmType::kAlgorithmCount) {
int64_t key = static_cast<int64_t>(algo_type);
if (cudnn_v8_auto_tune_map_.find(key) == cudnn_v8_auto_tune_map_.end()) {
CudnnFrontendPlanCache cache;
244 changes: 244 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu
@@ -0,0 +1,244 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <array>
#include <memory>

#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/flags.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h"

PHI_DECLARE_bool(cudnn_deterministic);
PHI_DECLARE_bool(cudnn_exhaustive_search);

namespace phi {
namespace fusion {

using helper = phi::CudnnFrontendConvHelper;

template <typename T>
using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;

template <typename T, typename Context>
void FusedScaleBiasAddReluKernel(const Context& dev_ctx,
const DenseTensor& x1,
const DenseTensor& scale1,
const DenseTensor& bias1,
const DenseTensor& x2,
const paddle::optional<DenseTensor>& scale2,
const paddle::optional<DenseTensor>& bias2,
bool fuse_dual,
bool exhaustive_search,
DenseTensor* y) {
PADDLE_ENFORCE_GE(dev_ctx.GetComputeCapability(),
80,
phi::errors::PreconditionNotMet(
"This op only supports Ampere and later devices, "
"but got compute capability: %d.",
dev_ctx.GetComputeCapability()));
auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8(
phi::autotune::AlgorithmType::kScaleBiasAddRelu);

// exhaustive search
exhaustive_search = exhaustive_search || FLAGS_cudnn_exhaustive_search;
bool deterministic = FLAGS_cudnn_deterministic;
PADDLE_ENFORCE_EQ(exhaustive_search && deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));

// alloc output variables
dev_ctx.template Alloc<T>(y);

// get handles
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
// create tensor descriptors
cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
auto tensor_format = phi::backends::gpu::ToCudnnDataType(x1.dtype());
auto tensor_format_math = CUDNN_DATA_FLOAT;
auto compute_dtype = CUDNN_DATA_FLOAT;

auto dim_x =
phi::backends::gpu::TransformDimOrder(phi::vectorize<int64_t>(x1.dims()));
std::vector<int64_t> dim_c(dim_x.size(), 1);
dim_c[1] = dim_x[1]; // [1, C, 1, 1]

std::vector<void*> data_ptrs;
std::vector<int64_t> uids;
int64_t uid = 100;

// inputs
auto x1_desc = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(x1.data<T>()));
uids.push_back(uid);

auto x2_desc = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(x2.data<T>()));
uids.push_back(uid);

auto scale1_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(scale1.data<T>()));
uids.push_back(uid);

auto bias1_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(bias1.data<T>()));
uids.push_back(uid);

// dispensable inputs
auto scale2_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
if (fuse_dual) {
data_ptrs.push_back(const_cast<T*>(scale2->data<T>()));
uids.push_back(uid);
}

auto bias2_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
if (fuse_dual) {
data_ptrs.push_back(const_cast<T*>(bias2->data<T>()));
uids.push_back(uid);
}

// outputs
auto y_desc = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(y->data<T>());
uids.push_back(uid);

// virtual outputs
auto after_scale1 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_bias1 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_scale2 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_bias2 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_add = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

// build ops
auto scale1_op = helper::MakePointwiseOp(
CUDNN_POINTWISE_MUL, compute_dtype, x1_desc, scale1_desc, after_scale1);

auto bias1_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
compute_dtype,
after_scale1,
bias1_desc,
after_bias1);

auto scale2_op = helper::MakePointwiseOp(
CUDNN_POINTWISE_MUL, compute_dtype, x2_desc, scale2_desc, after_scale2);

auto bias2_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
compute_dtype,
after_scale2,
bias2_desc,
after_bias2);

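// Select the second addend: the bias2 branch when fuse_dual, otherwise raw x2.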
cudnn_frontend::Tensor* tensor_to_add = fuse_dual ? &after_bias2 : &x2_desc;

auto add_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
compute_dtype,
after_bias1,
*tensor_to_add,
after_add);

auto relu_desc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_RELU_FWD)
.setComputeType(compute_dtype)
.build();

auto relu_op = cudnn_frontend::OperationBuilder(
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(after_add)
.setyDesc(y_desc)
.setpwDesc(relu_desc)
.build();

// build op graph
std::vector<cudnn_frontend::Operation const*> ops;
if (fuse_dual) {
ops = std::vector<cudnn_frontend::Operation const*>(
{&scale1_op, &bias1_op, &scale2_op, &bias2_op, &add_op, &relu_op});
} else {
ops = std::vector<cudnn_frontend::Operation const*>(
{&scale1_op, &bias1_op, &add_op, &relu_op});
}

auto op_graph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle)
.setOperationGraph(ops.size(), ops.data())
.build();
VLOG(6) << op_graph.describe();

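// The autotune cache is keyed on the input shape and fuse_dual, so a cached
// plan is only reused when the op graph would be identical.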
cudnn_frontend::feature_vector_t feature_vector;
phi::autotune::BuildFeatureVector(&feature_vector, dim_x, fuse_dual);

if (plan_cache.FindPlan(feature_vector, handle)) {
const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
int64_t workspace_size = 0;
plan_cache.GetPlanAndWorkspaceSize(
feature_vector, &cached_plan, &workspace_size, handle);
helper::ExecutePlan(handle,
&workspace_handle,
&data_ptrs,
&uids,
cached_plan->get_raw_desc(),
workspace_size);
return;
}

auto plans = helper::FindExecutionPlans(&op_graph,
exhaustive_search,
deterministic,
&data_ptrs,
&uids,
handle,
&workspace_handle);

helper::ExecutePlansAndCache(handle,
&workspace_handle,
&data_ptrs,
&uids,
&plans,
exhaustive_search,
feature_vector,
&plan_cache);
}

} // namespace fusion
} // namespace phi

PD_REGISTER_KERNEL(fused_scale_bias_add_relu,
GPU,
ALL_LAYOUT,
phi::fusion::FusedScaleBiasAddReluKernel,
phi::dtype::float16) {}
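
Since ops_api_gen.py (first file in this commit) registers the op for API generation, an eager-mode binding should be emitted for it. A hedged usage sketch, assuming the binding is exposed as paddle._C_ops.fused_scale_bias_add_relu with arguments in the YAML order, float16 NHWC inputs, and an Ampere-or-newer GPU:

import paddle

# Shapes and channel layout are assumptions for illustration (NHWC, C = 16).
x1 = paddle.rand([2, 8, 8, 16]).astype('float16')
x2 = paddle.rand([2, 8, 8, 16]).astype('float16')
scale1 = paddle.rand([16]).astype('float16')
bias1 = paddle.rand([16]).astype('float16')

# fuse_dual=False: x2 is added directly, so scale2/bias2 are passed as None.
y = paddle._C_ops.fused_scale_bias_add_relu(
    x1, scale1, bias1, x2, None, None, False, False)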
1 change: 1 addition & 0 deletions test/legacy_test/CMakeLists.txt
@@ -505,6 +505,7 @@ endif()

if(NOT WITH_CUDNN_FRONTEND)
list(REMOVE_ITEM TEST_OPS test_fused_scale_bias_relu_conv_bn_op)
list(REMOVE_ITEM TEST_OPS test_fused_scale_bias_add_relu_op)
endif()

# Some ops need to check results when gc is enabled