Add fused_scale_bias_add_relu
Tom-Zheng committed Nov 9, 2023
1 parent 7b8b8fa commit 99c5d30
Showing 12 changed files with 433 additions and 3 deletions.
2 changes: 2 additions & 0 deletions paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -74,6 +74,7 @@
'fused_fc_elementwise_layernorm',
'fused_multi_transformer_xpu',
'fused_scale_bias_relu_conv_bn',
'fused_scale_bias_add_relu',
'fusion_transpose_flatten_concat',
'generate_sequence_xpu',
'layer_norm_act_xpu',
@@ -104,6 +105,7 @@
'fused_batch_norm_act_',
'fused_bn_add_activation_',
'fused_scale_bias_relu_conv_bn',
'fused_scale_bias_add_relu',
'memcpy',
'print',
'recv_v2',
10 changes: 10 additions & 0 deletions paddle/phi/api/yaml/fused_ops.yaml
@@ -206,6 +206,16 @@
backward: fused_rotary_position_embedding_grad
support_dygraph_mode : true

- op : fused_scale_bias_add_relu
args : (Tensor x1, Tensor scale1, Tensor bias1, Tensor x2, Tensor scale2, Tensor bias2, bool fuse_dual, bool exhaustive_search)
optional : scale2, bias2
output : Tensor(y)
infer_meta :
func : FusedScaleBiasAddReluInferMeta
kernel :
func : fused_scale_bias_add_relu
data_type : x1

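For reference, the graph built by the kernel below evaluates y = relu(scale1 * x1 + bias1 + branch2), where branch2 is scale2 * x2 + bias2 when fuse_dual is set and plain x2 otherwise. A minimal NumPy sketch of that reference computation (hypothetical helper, shapes assumed to broadcast per-channel):

import numpy as np

def fused_scale_bias_add_relu_ref(x1, scale1, bias1, x2,
                                  scale2=None, bias2=None, fuse_dual=False):
    # Per-channel scale/bias broadcast against the activation tensors.
    branch1 = x1 * scale1 + bias1
    branch2 = x2 * scale2 + bias2 if fuse_dual else x2
    return np.maximum(branch1 + branch2, 0.0)
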
- op : fused_scale_bias_relu_conv_bn
args : (Tensor x, Tensor w, Tensor scale, Tensor bias, Tensor bn_scale, Tensor bn_bias, Tensor input_running_mean, Tensor input_running_var, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, str data_format, float momentum, float epsilon, bool fuse_prologue, bool exhaustive_search, int64_t accumulation_count = 0)
optional : scale, bias
26 changes: 26 additions & 0 deletions paddle/phi/infermeta/fusion.cc
@@ -1820,6 +1820,32 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x,
eq_bias->set_dims(c_dims);
}

void FusedScaleBiasAddReluInferMeta(const MetaTensor& x1,
const MetaTensor& scale1,
const MetaTensor& bias1,
const MetaTensor& x2,
const MetaTensor& scale2,
const MetaTensor& bias2,
bool fuse_dual,
bool exhaustive_search,
MetaTensor* y) {
// check optional inputs
if (fuse_dual) {
bool has_scale2 = !!scale2;
bool has_bias2 = !!bias2;
PADDLE_ENFORCE(has_scale2 && has_bias2,
phi::errors::InvalidArgument(
"Argument scale2 and bias2 should be provided when "
"fuse_dual is set, but got has_scale2=%d, has_bias2=%d, "
"fuse_dual=%d.",
has_scale2,
has_bias2,
fuse_dual));
}
// set output dims
y->set_dims(x1.dims());
}

void SqueezeExcitationInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& filter_max,
10 changes: 10 additions & 0 deletions paddle/phi/infermeta/fusion.h
@@ -478,6 +478,16 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x,
MetaTensor* eq_scale,
MetaTensor* eq_bias);

void FusedScaleBiasAddReluInferMeta(const MetaTensor& x1,
const MetaTensor& scale1,
const MetaTensor& bias1,
const MetaTensor& x2,
const MetaTensor& scale2,
const MetaTensor& bias2,
bool fuse_dual,
bool exhaustive_search,
MetaTensor* y);

void SqueezeExcitationInferMeta(const MetaTensor& x,
const MetaTensor& filter,
const MetaTensor& filter_max,
3 changes: 2 additions & 1 deletion paddle/phi/kernels/CMakeLists.txt
@@ -108,7 +108,8 @@ endif()

if(NOT WITH_CUDNN_FRONTEND)
list(REMOVE_ITEM kernel_cu
"fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu")
"fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu"
"fusion/gpu/fused_scale_bias_add_relu_kernel.cu")
endif()

set(cc_search_pattern
3 changes: 3 additions & 0 deletions paddle/phi/kernels/autotune/cache.cc
@@ -52,6 +52,9 @@ std::string AlgorithmTypeString(int64_t algo_type) {
return "scale_bias_relu_conv_bnstats";
} else if (algo_type == static_cast<int64_t>(AlgorithmType::kBNFinalize)) {
return "bn_finalize";
} else if (algo_type ==
static_cast<int64_t>(AlgorithmType::kScaleBiasAddRelu)) {
return "scale_bias_add_relu";
}
#endif
return std::to_string(algo_type);
5 changes: 3 additions & 2 deletions paddle/phi/kernels/autotune/cache.h
@@ -57,7 +57,8 @@ enum class AlgorithmType {
kConvBackwardFilterV8 = 12,
kScaleBiasReluConvBNstats = 13,
kBNFinalize = 14,
kAlgorithmCount = 15
kScaleBiasAddRelu = 15,
kAlgorithmCount = 16
#endif
};

@@ -181,7 +182,7 @@ class AutoTuneCache {
}
#ifdef PADDLE_WITH_CUDNN_FRONTEND
} else if (algo_type >= AlgorithmType::kConvForwardV8 &&
algo_type <= AlgorithmType::kBNFinalize) {
algo_type < AlgorithmType::kAlgorithmCount) {
int64_t key = static_cast<int64_t>(algo_type);
if (cudnn_v8_auto_tune_map_.find(key) == cudnn_v8_auto_tune_map_.end()) {
CudnnFrontendPlanCache cache;
244 changes: 244 additions & 0 deletions paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu
@@ -0,0 +1,244 @@
/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <array>
#include <memory>

#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/core/flags.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/autotune/cache.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h"

PHI_DECLARE_bool(cudnn_deterministic);
PHI_DECLARE_bool(cudnn_exhaustive_search);

namespace phi {
namespace fusion {

using helper = phi::CudnnFrontendConvHelper;

template <typename T>
using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;

template <typename T, typename Context>
void FusedScaleBiasAddReluKernel(const Context& dev_ctx,
const DenseTensor& x1,
const DenseTensor& scale1,
const DenseTensor& bias1,
const DenseTensor& x2,
const paddle::optional<DenseTensor>& scale2,
const paddle::optional<DenseTensor>& bias2,
bool fuse_dual,
bool exhaustive_search,
DenseTensor* y) {
PADDLE_ENFORCE_GE(dev_ctx.GetComputeCapability(),
80,
phi::errors::PreconditionNotMet(
"This op only supports Ampere and later devices, "
"but got compute capability: %d.",
dev_ctx.GetComputeCapability()));
auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8(
phi::autotune::AlgorithmType::kScaleBiasAddRelu);

// exhaustive search
exhaustive_search = exhaustive_search || FLAGS_cudnn_exhaustive_search;
bool deterministic = FLAGS_cudnn_deterministic;
PADDLE_ENFORCE_EQ(exhaustive_search && deterministic,
false,
phi::errors::InvalidArgument(
"Cann't set exhaustive_search True and "
"FLAGS_cudnn_deterministic True at same time."));

// alloc output variables
dev_ctx.template Alloc<T>(y);

// get handles
auto handle = dev_ctx.cudnn_handle();
auto workspace_handle = dev_ctx.cudnn_workspace_handle();
// create tensor descriptors
cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
auto tensor_format = phi::backends::gpu::ToCudnnDataType(x1.dtype());
auto tensor_format_math = CUDNN_DATA_FLOAT;
auto compute_dtype = CUDNN_DATA_FLOAT;

auto dim_x =
phi::backends::gpu::TransformDimOrder(phi::vectorize<int64_t>(x1.dims()));
std::vector<int64_t> dim_c(dim_x.size(), 1);
dim_c[1] = dim_x[1]; // [1, C, 1, 1]

std::vector<void*> data_ptrs;
std::vector<int64_t> uids;
int64_t uid = 100;

// inputs
auto x1_desc = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(x1.data<T>()));
uids.push_back(uid);

auto x2_desc = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(x2.data<T>()));
uids.push_back(uid);

auto scale1_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(scale1.data<T>()));
uids.push_back(uid);

auto bias1_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(const_cast<T*>(bias1.data<T>()));
uids.push_back(uid);

// dispensable inputs
auto scale2_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
if (fuse_dual) {
data_ptrs.push_back(const_cast<T*>(scale2->data<T>()));
uids.push_back(uid);
}

auto bias2_desc = helper::GetGeneralTensorDescriptor(
dim_c, layout_format, ++uid, 16, tensor_format);
if (fuse_dual) {
data_ptrs.push_back(const_cast<T*>(bias2->data<T>()));
uids.push_back(uid);
}

// outputs
auto y_desc = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format);
data_ptrs.push_back(y->data<T>());
uids.push_back(uid);

// virtual outputs
auto after_scale1 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_bias1 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_scale2 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_bias2 = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

auto after_add = helper::GetGeneralTensorDescriptor(
dim_x, layout_format, ++uid, 16, tensor_format_math, true);

// build ops
auto scale1_op = helper::MakePointwiseOp(
CUDNN_POINTWISE_MUL, compute_dtype, x1_desc, scale1_desc, after_scale1);

auto bias1_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
compute_dtype,
after_scale1,
bias1_desc,
after_bias1);

auto scale2_op = helper::MakePointwiseOp(
CUDNN_POINTWISE_MUL, compute_dtype, x2_desc, scale2_desc, after_scale2);

auto bias2_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
compute_dtype,
after_scale2,
bias2_desc,
after_bias2);

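// Select the second addend: the bias2 branch when fuse_dual, otherwise raw x2.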
cudnn_frontend::Tensor* tensor_to_add = fuse_dual ? &after_bias2 : &x2_desc;

auto add_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
compute_dtype,
after_bias1,
*tensor_to_add,
after_add);

auto relu_desc = cudnn_frontend::PointWiseDescBuilder()
.setMode(CUDNN_POINTWISE_RELU_FWD)
.setComputeType(compute_dtype)
.build();

auto relu_op = cudnn_frontend::OperationBuilder(
CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
.setxDesc(after_add)
.setyDesc(y_desc)
.setpwDesc(relu_desc)
.build();

// build op graph
std::vector<cudnn_frontend::Operation const*> ops;
if (fuse_dual) {
ops = std::vector<cudnn_frontend::Operation const*>(
{&scale1_op, &bias1_op, &scale2_op, &bias2_op, &add_op, &relu_op});
} else {
ops = std::vector<cudnn_frontend::Operation const*>(
{&scale1_op, &bias1_op, &add_op, &relu_op});
}

auto op_graph = cudnn_frontend::OperationGraphBuilder()
.setHandle(handle)
.setOperationGraph(ops.size(), ops.data())
.build();
VLOG(6) << op_graph.describe();

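// The autotune cache is keyed on the input shape and fuse_dual, so a cached
// plan is only reused when the op graph would be identical.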
cudnn_frontend::feature_vector_t feature_vector;
phi::autotune::BuildFeatureVector(&feature_vector, dim_x, fuse_dual);

if (plan_cache.FindPlan(feature_vector, handle)) {
const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
int64_t workspace_size = 0;
plan_cache.GetPlanAndWorkspaceSize(
feature_vector, &cached_plan, &workspace_size, handle);
helper::ExecutePlan(handle,
&workspace_handle,
&data_ptrs,
&uids,
cached_plan->get_raw_desc(),
workspace_size);
return;
}

auto plans = helper::FindExecutionPlans(&op_graph,
exhaustive_search,
deterministic,
&data_ptrs,
&uids,
handle,
&workspace_handle);

helper::ExecutePlansAndCache(handle,
&workspace_handle,
&data_ptrs,
&uids,
&plans,
exhaustive_search,
feature_vector,
&plan_cache);
}

} // namespace fusion
} // namespace phi

PD_REGISTER_KERNEL(fused_scale_bias_add_relu,
GPU,
ALL_LAYOUT,
phi::fusion::FusedScaleBiasAddReluKernel,
phi::dtype::float16) {}
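
Since ops_api_gen.py (first file in this commit) registers the op for API generation, an eager-mode binding should be emitted for it. A hedged usage sketch, assuming the binding is exposed as paddle._C_ops.fused_scale_bias_add_relu with arguments in the YAML order, float16 NHWC inputs, and an Ampere-or-newer GPU:

import paddle

# Shapes and channel layout are assumptions for illustration (NHWC, C = 16).
x1 = paddle.rand([2, 8, 8, 16]).astype('float16')
x2 = paddle.rand([2, 8, 8, 16]).astype('float16')
scale1 = paddle.rand([16]).astype('float16')
bias1 = paddle.rand([16]).astype('float16')

# fuse_dual=False: x2 is added directly, so scale2/bias2 are passed as None.
y = paddle._C_ops.fused_scale_bias_add_relu(
    x1, scale1, bias1, x2, None, None, False, False)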
1 change: 1 addition & 0 deletions test/legacy_test/CMakeLists.txt
@@ -505,6 +505,7 @@ endif()

if(NOT WITH_CUDNN_FRONTEND)
list(REMOVE_ITEM TEST_OPS test_fused_scale_bias_relu_conv_bn_op)
list(REMOVE_ITEM TEST_OPS test_fused_scale_bias_add_relu_op)
endif()

# Some ops need to check results when gc is enabled