From 99c5d30fabc147ae4d64d1b9b123bdbcddecac72 Mon Sep 17 00:00:00 2001
From: "Tian Zheng (Engrg-Hardware 1)"
Date: Sun, 29 Oct 2023 07:35:32 +0000
Subject: [PATCH] Add fused_scale_bias_add_relu

---
 .../pir/dialect/op_generator/ops_api_gen.py   |   2 +
 paddle/phi/api/yaml/fused_ops.yaml            |  10 +
 paddle/phi/infermeta/fusion.cc                |  26 ++
 paddle/phi/infermeta/fusion.h                 |  10 +
 paddle/phi/kernels/CMakeLists.txt             |   3 +-
 paddle/phi/kernels/autotune/cache.cc          |   3 +
 paddle/phi/kernels/autotune/cache.h           |   5 +-
 .../gpu/fused_scale_bias_add_relu_kernel.cu   | 244 ++++++++++++++++++
 test/legacy_test/CMakeLists.txt               |   1 +
 .../test_fused_scale_bias_add_relu_op.py      | 130 ++++++++++
 test/white_list/op_accuracy_white_list.py     |   1 +
 tools/gpups_test.sh                           |   1 +
 12 files changed, 433 insertions(+), 3 deletions(-)
 create mode 100644 paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu
 create mode 100644 test/legacy_test/test_fused_scale_bias_add_relu_op.py

diff --git a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
index e265b7c44e03c..642cf224c5c18 100644
--- a/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
+++ b/paddle/fluid/pir/dialect/op_generator/ops_api_gen.py
@@ -74,6 +74,7 @@
     'fused_fc_elementwise_layernorm',
     'fused_multi_transformer_xpu',
     'fused_scale_bias_relu_conv_bn',
+    'fused_scale_bias_add_relu',
     'fusion_transpose_flatten_concat',
     'generate_sequence_xpu',
     'layer_norm_act_xpu',
@@ -104,6 +105,7 @@
     'fused_batch_norm_act_',
     'fused_bn_add_activation_',
     'fused_scale_bias_relu_conv_bn',
+    'fused_scale_bias_add_relu',
     'memcpy',
     'print',
     'recv_v2',
diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml
index ba5e5bbaa07ab..0e764467ddc3c 100644
--- a/paddle/phi/api/yaml/fused_ops.yaml
+++ b/paddle/phi/api/yaml/fused_ops.yaml
@@ -206,6 +206,16 @@
   backward: fused_rotary_position_embedding_grad
   support_dygraph_mode : true

+- op : fused_scale_bias_add_relu
+  args : (Tensor x1, Tensor scale1, Tensor bias1, Tensor x2, Tensor scale2, Tensor bias2, bool fuse_dual, bool exhaustive_search)
+  optional : scale2, bias2
+  output : Tensor(y)
+  infer_meta :
+    func : FusedScaleBiasAddReluInferMeta
+  kernel :
+    func : fused_scale_bias_add_relu
+    data_type : x1
+
 - op : fused_scale_bias_relu_conv_bn
   args : (Tensor x, Tensor w, Tensor scale, Tensor bias, Tensor bn_scale, Tensor bn_bias, Tensor input_running_mean, Tensor input_running_var, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, str data_format, float momentum, float epsilon, bool fuse_prologue, bool exhaustive_search, int64_t accumulation_count = 0)
   optional : scale, bias
diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc
index 06a457ccaba09..e6fa399e3b7c5 100644
--- a/paddle/phi/infermeta/fusion.cc
+++ b/paddle/phi/infermeta/fusion.cc
@@ -1820,6 +1820,32 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x,
   eq_bias->set_dims(c_dims);
 }

+void FusedScaleBiasAddReluInferMeta(const MetaTensor& x1,
+                                    const MetaTensor& scale1,
+                                    const MetaTensor& bias1,
+                                    const MetaTensor& x2,
+                                    const MetaTensor& scale2,
+                                    const MetaTensor& bias2,
+                                    bool fuse_dual,
+                                    bool exhaustive_search,
+                                    MetaTensor* y) {
+  // check optional inputs
+  if (fuse_dual) {
+    bool has_scale2 = !!scale2;
+    bool has_bias2 = !!bias2;
+    PADDLE_ENFORCE(has_scale2 && has_bias2,
+                   phi::errors::InvalidArgument(
+                       "Arguments scale2 and bias2 should be provided when "
+                       "fuse_dual is set, but got has_scale2=%d, has_bias2=%d, "
+                       "fuse_dual=%d.",
+                       has_scale2,
+                       has_bias2,
+                       fuse_dual));
+  }
+  // set output dims
+  y->set_dims(x1.dims());
+}
+
 void SqueezeExcitationInferMeta(const MetaTensor& x,
                                 const MetaTensor& filter,
                                 const MetaTensor& filter_max,
diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h
index 5b3de8aea0d60..7b1508d5a639c 100644
--- a/paddle/phi/infermeta/fusion.h
+++ b/paddle/phi/infermeta/fusion.h
@@ -478,6 +478,16 @@ void FusedScaleBiasReluConvBnInferMeta(const MetaTensor& x,
                                        MetaTensor* eq_scale,
                                        MetaTensor* eq_bias);

+void FusedScaleBiasAddReluInferMeta(const MetaTensor& x1,
+                                    const MetaTensor& scale1,
+                                    const MetaTensor& bias1,
+                                    const MetaTensor& x2,
+                                    const MetaTensor& scale2,
+                                    const MetaTensor& bias2,
+                                    bool fuse_dual,
+                                    bool exhaustive_search,
+                                    MetaTensor* y);
+
 void SqueezeExcitationInferMeta(const MetaTensor& x,
                                 const MetaTensor& filter,
                                 const MetaTensor& filter_max,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index 617095fbff4d1..5f5330df4dfac 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -108,7 +108,8 @@ endif()

 if(NOT WITH_CUDNN_FRONTEND)
   list(REMOVE_ITEM kernel_cu
-       "fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu")
+       "fusion/gpu/fused_scale_bias_relu_conv_bn_kernel.cu"
+       "fusion/gpu/fused_scale_bias_add_relu_kernel.cu")
 endif()

 set(cc_search_pattern
diff --git a/paddle/phi/kernels/autotune/cache.cc b/paddle/phi/kernels/autotune/cache.cc
index ba48e2e00ce54..0f1c9171264d1 100644
--- a/paddle/phi/kernels/autotune/cache.cc
+++ b/paddle/phi/kernels/autotune/cache.cc
@@ -52,6 +52,9 @@ std::string AlgorithmTypeString(int64_t algo_type) {
     return "scale_bias_relu_conv_bnstats";
   } else if (algo_type == static_cast<int64_t>(AlgorithmType::kBNFinalize)) {
     return "bn_finalize";
+  } else if (algo_type ==
+             static_cast<int64_t>(AlgorithmType::kScaleBiasAddRelu)) {
+    return "scale_bias_add_relu";
   }
 #endif
   return std::to_string(algo_type);
diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h
index 34b98e28f50c7..ff47bfffcc448 100644
--- a/paddle/phi/kernels/autotune/cache.h
+++ b/paddle/phi/kernels/autotune/cache.h
@@ -57,7 +57,8 @@ enum class AlgorithmType {
   kConvBackwardFilterV8 = 12,
   kScaleBiasReluConvBNstats = 13,
   kBNFinalize = 14,
-  kAlgorithmCount = 15
+  kScaleBiasAddRelu = 15,
+  kAlgorithmCount = 16
 #endif
 };

@@ -181,7 +182,7 @@ class AutoTuneCache {
     }
 #ifdef PADDLE_WITH_CUDNN_FRONTEND
     } else if (algo_type >= AlgorithmType::kConvForwardV8 &&
-               algo_type <= AlgorithmType::kBNFinalize) {
+               algo_type < AlgorithmType::kAlgorithmCount) {
       int64_t key = static_cast<int64_t>(algo_type);
       if (cudnn_v8_auto_tune_map_.find(key) == cudnn_v8_auto_tune_map_.end()) {
         CudnnFrontendPlanCache cache;
diff --git a/paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu
new file mode 100644
index 0000000000000..ff5edd689f7f3
--- /dev/null
+++ b/paddle/phi/kernels/fusion/gpu/fused_scale_bias_add_relu_kernel.cu
@@ -0,0 +1,244 @@
+/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <string>
+#include <vector>
+
+#include "paddle/phi/backends/gpu/cuda/cudnn_helper.h"
+#include "paddle/phi/backends/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_info.h"
+#include "paddle/phi/core/flags.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/autotune/cache.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/batch_norm_utils.h"
+#include "paddle/phi/kernels/gpudnn/conv_cudnn_frontend.h"
+
+PHI_DECLARE_bool(cudnn_deterministic);
+PHI_DECLARE_bool(cudnn_exhaustive_search);
+
+namespace phi {
+namespace fusion {
+
+using helper = phi::CudnnFrontendConvHelper;
+
+template <typename T>
+using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
+
+template <typename T, typename Context>
+void FusedScaleBiasAddReluKernel(const Context& dev_ctx,
+                                 const DenseTensor& x1,
+                                 const DenseTensor& scale1,
+                                 const DenseTensor& bias1,
+                                 const DenseTensor& x2,
+                                 const paddle::optional<DenseTensor>& scale2,
+                                 const paddle::optional<DenseTensor>& bias2,
+                                 bool fuse_dual,
+                                 bool exhaustive_search,
+                                 DenseTensor* y) {
+  PADDLE_ENFORCE_GE(dev_ctx.GetComputeCapability(),
+                    80,
+                    phi::errors::PreconditionNotMet(
+                        "This op only supports Ampere and later devices, "
+                        "but got compute capability: %d.",
+                        dev_ctx.GetComputeCapability()));
+  auto& plan_cache = phi::autotune::AutoTuneCache::Instance().GetConvV8(
+      phi::autotune::AlgorithmType::kScaleBiasAddRelu);
+
+  // exhaustive search
+  exhaustive_search = exhaustive_search || FLAGS_cudnn_exhaustive_search;
+  bool deterministic = FLAGS_cudnn_deterministic;
+  PADDLE_ENFORCE_EQ(exhaustive_search && deterministic,
+                    false,
+                    phi::errors::InvalidArgument(
+                        "Can't set exhaustive_search True and "
+                        "FLAGS_cudnn_deterministic True at the same time."));
+
+  // alloc output variables
+  dev_ctx.template Alloc<T>(y);
+
+  // get handles
+  auto handle = dev_ctx.cudnn_handle();
+  auto workspace_handle = dev_ctx.cudnn_workspace_handle();
+  // create tensor descriptors
+  cudnnTensorFormat_t layout_format = CUDNN_TENSOR_NHWC;
+  auto tensor_format = phi::backends::gpu::ToCudnnDataType(x1.dtype());
+  auto tensor_format_math = CUDNN_DATA_FLOAT;
+  auto compute_dtype = CUDNN_DATA_FLOAT;
+
+  auto dim_x = phi::backends::gpu::TransformDimOrder(
+      phi::vectorize<int64_t>(x1.dims()));
+  std::vector<int64_t> dim_c(dim_x.size(), 1);
+  dim_c[1] = dim_x[1];  // [1, C, 1, 1]
+
+  std::vector<void*> data_ptrs;
+  std::vector<int64_t> uids;
+  int64_t uid = 100;
+
+  // inputs
+  auto x1_desc = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format);
+  data_ptrs.push_back(const_cast<void*>(x1.data()));
+  uids.push_back(uid);
+
+  auto x2_desc = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format);
+  data_ptrs.push_back(const_cast<void*>(x2.data()));
+  uids.push_back(uid);
+
+  auto scale1_desc = helper::GetGeneralTensorDescriptor(
+      dim_c, layout_format, ++uid, 16, tensor_format);
+  data_ptrs.push_back(const_cast<void*>(scale1.data()));
+  uids.push_back(uid);
+
+  auto bias1_desc = helper::GetGeneralTensorDescriptor(
+      dim_c, layout_format, ++uid, 16, tensor_format);
+  data_ptrs.push_back(const_cast<void*>(bias1.data()));
+  uids.push_back(uid);
+
+  // dispensable inputs
+  auto scale2_desc = helper::GetGeneralTensorDescriptor(
+      dim_c, layout_format, ++uid, 16, tensor_format);
+  if (fuse_dual) {
+    data_ptrs.push_back(const_cast<void*>(scale2->data()));
+    uids.push_back(uid);
+  }
+
+  auto bias2_desc = helper::GetGeneralTensorDescriptor(
+      dim_c, layout_format, ++uid, 16, tensor_format);
+  if (fuse_dual) {
+    data_ptrs.push_back(const_cast<void*>(bias2->data()));
+    uids.push_back(uid);
+  }
+
+  // outputs
+  auto y_desc = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format);
+  data_ptrs.push_back(y->data());
+  uids.push_back(uid);
+
+  // virtual outputs
+  auto after_scale1 = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format_math, true);
+
+  auto after_bias1 = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format_math, true);
+
+  auto after_scale2 = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format_math, true);
+
+  auto after_bias2 = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format_math, true);
+
+  auto after_add = helper::GetGeneralTensorDescriptor(
+      dim_x, layout_format, ++uid, 16, tensor_format_math, true);
+
+  // build ops
+  auto scale1_op = helper::MakePointwiseOp(
+      CUDNN_POINTWISE_MUL, compute_dtype, x1_desc, scale1_desc, after_scale1);
+
+  auto bias1_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
+                                          compute_dtype,
+                                          after_scale1,
+                                          bias1_desc,
+                                          after_bias1);
+
+  auto scale2_op = helper::MakePointwiseOp(
+      CUDNN_POINTWISE_MUL, compute_dtype, x2_desc, scale2_desc, after_scale2);
+
+  auto bias2_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
+                                          compute_dtype,
+                                          after_scale2,
+                                          bias2_desc,
+                                          after_bias2);
+
+  cudnn_frontend::Tensor* tensor_to_add = fuse_dual ? &after_bias2 : &x2_desc;
+
+  auto add_op = helper::MakePointwiseOp(CUDNN_POINTWISE_ADD,
+                                        compute_dtype,
+                                        after_bias1,
+                                        *tensor_to_add,
+                                        after_add);
+
+  auto relu_desc = cudnn_frontend::PointWiseDescBuilder()
+                       .setMode(CUDNN_POINTWISE_RELU_FWD)
+                       .setComputeType(compute_dtype)
+                       .build();
+
+  auto relu_op = cudnn_frontend::OperationBuilder(
+                     CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR)
+                     .setxDesc(after_add)
+                     .setyDesc(y_desc)
+                     .setpwDesc(relu_desc)
+                     .build();
+
+  // build op graph
+  std::vector<cudnn_frontend::Operation const*> ops;
+  if (fuse_dual) {
+    ops = std::vector<cudnn_frontend::Operation const*>(
+        {&scale1_op, &bias1_op, &scale2_op, &bias2_op, &add_op, &relu_op});
+  } else {
+    ops = std::vector<cudnn_frontend::Operation const*>(
+        {&scale1_op, &bias1_op, &add_op, &relu_op});
+  }
+
+  auto op_graph = cudnn_frontend::OperationGraphBuilder()
+                      .setHandle(handle)
+                      .setOperationGraph(ops.size(), ops.data())
+                      .build();
+  VLOG(6) << op_graph.describe();
+
+  cudnn_frontend::feature_vector_t feature_vector;
+  phi::autotune::BuildFeatureVector(&feature_vector, dim_x, fuse_dual);
+
+  if (plan_cache.FindPlan(feature_vector, handle)) {
+    const cudnn_frontend::ExecutionPlan* cached_plan = nullptr;
+    int64_t workspace_size = 0;
+    plan_cache.GetPlanAndWorkspaceSize(
+        feature_vector, &cached_plan, &workspace_size, handle);
+    helper::ExecutePlan(handle,
+                        &workspace_handle,
+                        &data_ptrs,
+                        &uids,
+                        cached_plan->get_raw_desc(),
+                        workspace_size);
+    return;
+  }
+
+  auto plans = helper::FindExecutionPlans(&op_graph,
+                                          exhaustive_search,
+                                          deterministic,
+                                          &data_ptrs,
+                                          &uids,
+                                          handle,
+                                          &workspace_handle);
+
+  helper::ExecutePlansAndCache(handle,
+                               &workspace_handle,
+                               &data_ptrs,
+                               &uids,
+                               &plans,
+                               exhaustive_search,
+                               feature_vector,
+                               &plan_cache);
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(fused_scale_bias_add_relu,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::fusion::FusedScaleBiasAddReluKernel,
+                   phi::dtype::float16) {}
diff --git a/test/legacy_test/CMakeLists.txt b/test/legacy_test/CMakeLists.txt
index fb6b02ce37b37..6e73892602043 100644
--- a/test/legacy_test/CMakeLists.txt
+++ b/test/legacy_test/CMakeLists.txt
@@ -505,6 +505,7 @@ endif()

 if(NOT WITH_CUDNN_FRONTEND)
   list(REMOVE_ITEM TEST_OPS test_fused_scale_bias_relu_conv_bn_op)
+  list(REMOVE_ITEM TEST_OPS test_fused_scale_bias_add_relu_op)
 endif()

 # Some ops need to check results when gc is enabled
diff --git a/test/legacy_test/test_fused_scale_bias_add_relu_op.py b/test/legacy_test/test_fused_scale_bias_add_relu_op.py
new file mode 100644
index 0000000000000..44952e1ea23d1
--- /dev/null
+++ b/test/legacy_test/test_fused_scale_bias_add_relu_op.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+from op_test import OpTest, skip_check_grad_ci
+
+import paddle
+from paddle.base import core
+
+
+def skip_unit_test():
+    return (
+        not paddle.is_compiled_with_cuda()
+        or paddle.device.cuda.get_device_capability()[0] < 8
+    )
+
+
+skip_msg = "only supports CUDA on Ampere or later devices"
+
+
+@skip_check_grad_ci(reason="no grad op")
+@unittest.skipIf(skip_unit_test(), skip_msg)
+class TestFusedScaleBiasAddReluOp(OpTest):
+    def setUp(self):
+        self.op_type = "fused_scale_bias_add_relu"
+        self.dtype = np.float16
+        self.outputs = None
+
+        self.init_test_case()
+        self.init_attr()
+
+        self.attrs = {
+            'fuse_dual': self.fuse_dual,
+            'exhaustive_search': self.exhaustive_search,
+        }
+
+        c_dim = self.input_size[-1]
+        x1_input = np.random.random(self.input_size).astype(self.dtype) - 0.5
+        x2_input = np.random.random(self.input_size).astype(self.dtype) - 0.5
+        scale1_input = np.random.random(c_dim).astype(self.dtype) - 0.5
+        scale2_input = np.random.random(c_dim).astype(self.dtype) - 0.5
+        bias1_input = np.random.random(c_dim).astype(self.dtype) - 0.5
+        bias2_input = np.random.random(c_dim).astype(self.dtype) - 0.5
+
+        # calculate reference output
+        reshaped_scale1_input = scale1_input.reshape(1, 1, 1, c_dim)
+        reshaped_scale2_input = scale2_input.reshape(1, 1, 1, c_dim)
+        reshaped_bias1_input = bias1_input.reshape(1, 1, 1, c_dim)
+        reshaped_bias2_input = bias2_input.reshape(1, 1, 1, c_dim)
+
+        after_bias1 = x1_input * reshaped_scale1_input + reshaped_bias1_input
+        after_bias2 = x2_input * reshaped_scale2_input + reshaped_bias2_input
+
+        if self.fuse_dual:
+            after_add = after_bias1 + after_bias2
+        else:
+            after_add = after_bias1 + x2_input
+
+        y_output = np.maximum(after_add, 0).astype(self.dtype)
+
+        if self.fuse_dual:
+            self.inputs = {
+                'x1': x1_input,
+                'scale1': scale1_input,
+                'bias1': bias1_input,
+                'x2': x2_input,
+                'scale2': scale2_input,
+                'bias2': bias2_input,
+            }
+        else:
+            self.inputs = {
+                'x1': x1_input,
+                'scale1': scale1_input,
+                'bias1': bias1_input,
+                'x2': x2_input,
+            }
+
+        self.outputs = {
+            'y': y_output,
+        }
+
+    def has_cuda(self):
+        return core.is_compiled_with_cuda()
+
+    def test_check_output(self):
+        if self.has_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, check_dygraph=False, atol=2e-2)
+
+    def init_test_case(self):
+        self.input_size = [2, 8, 8, 16]
+
+    def init_attr(self):
+        self.fuse_dual = False
+        self.exhaustive_search = False
+
+
+@skip_check_grad_ci(reason="no grad op")
+@unittest.skipIf(skip_unit_test(), skip_msg)
+class TestFusedScaleBiasAddReluOpDual(TestFusedScaleBiasAddReluOp):
+    def init_attr(self):
+        self.fuse_dual = True
+        self.exhaustive_search = False
+
+
+@skip_check_grad_ci(reason="no grad op")
+@unittest.skipIf(skip_unit_test(), skip_msg)
+class TestFusedScaleBiasAddReluOpExhaustive(TestFusedScaleBiasAddReluOp):
+    def init_attr(self):
+        self.fuse_dual = False
+        self.exhaustive_search = True
+
+
+if __name__ == '__main__':
+    np.random.seed(0)
+    unittest.main()
diff --git a/test/white_list/op_accuracy_white_list.py b/test/white_list/op_accuracy_white_list.py
index ce0e7dde85fe8..5ad871e071ba4 100644
--- a/test/white_list/op_accuracy_white_list.py
+++ b/test/white_list/op_accuracy_white_list.py
@@ -88,6 +88,7 @@
 NO_FP16_COMPARED_WITH_FP32_OP_LIST = [
     'fake_quantize_moving_average_abs_max',
     'fused_scale_bias_relu_conv_bn',
+    'fused_scale_bias_add_relu',
     'p_norm',
 ]
diff --git a/tools/gpups_test.sh b/tools/gpups_test.sh
index 982e850deeb13..6221a4b4f90e1 100644
--- a/tools/gpups_test.sh
+++ b/tools/gpups_test.sh
@@ -96,6 +96,7 @@ parallel_list="^init_phi_test$|\
 ^test_fused_residual_dropout_bias$|\
 ^test_fused_rotary_position_embedding$|\
 ^test_fused_scale_bias_relu_conv_bn_op$|\
+^test_fused_scale_bias_add_relu_op$|\
 ^test_fused_token_prune_op$|\
 ^test_fused_transformer_encoder_layer$|\
 ^test_fused_transformer_with_amp_decorator$|\
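
--
Reviewer note (addendum, not part of the patch): the fused op computes
y = relu(scale1 * x1 + bias1 + rhs), where rhs = scale2 * x2 + bias2 when
fuse_dual is true and rhs = x2 otherwise, with the per-channel scale/bias
broadcast over the last (C) axis of the NHWC inputs. A minimal NumPy sketch
of these reference semantics, mirroring the unit test above (the function
name is illustrative only, not an API added by this patch):

    import numpy as np

    def fused_scale_bias_add_relu_ref(
        x1, scale1, bias1, x2, scale2=None, bias2=None, fuse_dual=False
    ):
        # Broadcast per-channel scale/bias over NHWC inputs of shape
        # [N, H, W, C]; scale/bias are 1-D vectors of length C.
        c = x1.shape[-1]
        lhs = x1 * scale1.reshape(1, 1, 1, c) + bias1.reshape(1, 1, 1, c)
        if fuse_dual:
            # Dual mode: the second input gets its own scale/bias.
            rhs = x2 * scale2.reshape(1, 1, 1, c) + bias2.reshape(1, 1, 1, c)
        else:
            rhs = x2
        return np.maximum(lhs + rhs, 0)

This matches the cuDNN frontend graph built in the kernel: two MUL/ADD
pointwise pairs (the second pair only when fuse_dual is set), an ADD, and a
RELU_FWD epilogue.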