diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 23d5b0de247223..6aa049b106b8e3 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -145,6 +145,7 @@ if(WITH_TENSORRT) pass_library(elementwise_groupnorm_act_pass inference) pass_library(preln_elementwise_groupnorm_act_pass inference) pass_library(groupnorm_act_pass inference) + pass_library(trans_layernorm_fuse_pass inference) pass_library(trt_embedding_eltwise_layernorm_fuse_pass inference) pass_library(preln_embedding_eltwise_layernorm_fuse_pass inference) endif() diff --git a/paddle/fluid/framework/ir/trans_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/trans_layernorm_fuse_pass.cc new file mode 100644 index 00000000000000..6e0919a66535ff --- /dev/null +++ b/paddle/fluid/framework/ir/trans_layernorm_fuse_pass.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/framework/ir/trans_layernorm_fuse_pass.h" +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +struct TransLayernormPattern : public PatternBase { + TransLayernormPattern(PDPattern *pattern, const std::string &name_scope) + : PatternBase(pattern, name_scope, "trans_layernorm") {} + void operator()(PDNode *x); + PATTERN_DECL_NODE(transpose); + PATTERN_DECL_NODE(transpose_output); + PATTERN_DECL_NODE(reshape); + PATTERN_DECL_NODE(reshape_output); + PATTERN_DECL_NODE(layernorm); + PATTERN_DECL_NODE(layernorm_scale); + PATTERN_DECL_NODE(layernorm_bias); + PATTERN_DECL_NODE(layernorm_output); +}; +void TransLayernormPattern::operator()(PDNode *x) { + std::unordered_set reshape_ops{"reshape2", + "flatten_contiguous_range"}; + auto *transpose = + pattern->NewNode(transpose_repr())->assert_is_op("transpose2"); + auto *transpose_output = pattern->NewNode(transpose_output_repr()) + ->assert_is_op_output("transpose2") + ->assert_is_ops_input(reshape_ops, "X"); + transpose->LinksFrom({x}).LinksTo({transpose_output}); + auto *reshape = pattern->NewNode(reshape_repr())->assert_is_ops(reshape_ops); + auto *reshape_output = pattern->NewNode(reshape_output_repr()) + ->assert_is_ops_output(reshape_ops, "Out") + ->assert_is_op_input("layer_norm", "X") + ->AsOutput(); + reshape->LinksFrom({transpose_output}).LinksTo({reshape_output}); + auto *layernorm = + pattern->NewNode(layernorm_repr())->assert_is_op("layer_norm"); + auto *layernorm_scale = pattern->NewNode(layernorm_scale_repr()) + ->assert_is_op_input("layer_norm", "Scale") + ->AsInput(); + auto *layernorm_bias = pattern->NewNode(layernorm_bias_repr()) + ->assert_is_op_input("layer_norm", "Bias") + ->AsInput(); + auto *layernorm_output = 
pattern->NewNode(layernorm_output_repr())
+          ->assert_is_op_output("layer_norm", "Y")
+          ->AsOutput();
+  layernorm->LinksFrom({reshape_output, layernorm_scale, layernorm_bias})
+      .LinksTo({layernorm_output});
+}
+}  // namespace patterns
+
+// this pass makes a fusion as below:
+//
+//       |
+//   transpose(axis=[0, 2, 3, 1])
+//       |
+//   reshape(n, h*w, c)
+//       |      |
+//      out   layernorm(begin_norm_axis=2 or -1)
+//              |
+//        layernorm_out
+//
+// -> fuse to
+//
+//       |
+//  trans_layernorm
+//       |      |
+//      out   layernorm_out
+//
+
+int TransLayernormFusePass::ApplyConvTransLayernormPattern(
+    ir::Graph *graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  FusePassBase::Init("trans_layernorm_fuse", graph);
+  int found_subgraph_count = 0;
+  GraphPatternDetector gpd;
+
+  PDNode *x = nullptr;
+  x = gpd.mutable_pattern()
+          ->NewNode("trans_layernorm_fuse/x")
+          ->AsInput()
+          ->assert_var_not_persistable()
+          ->assert_is_op_input("transpose2", "X");
+  patterns::TransLayernormPattern fused_pattern(gpd.mutable_pattern(),
+                                                "trans_layernorm_fuse");
+  fused_pattern(x);
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *graph) {
+    if (subgraph.count(x) <= 0) {
+      LOG(WARNING) << "The subgraph is empty.";
+      return;
+    }
+    VLOG(4) << "handle transpose layernorm fuse";
+    GET_IR_NODE_FROM_SUBGRAPH(transpose, transpose, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(
+        transpose_output, transpose_output, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape, reshape, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(reshape_output, reshape_output, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layernorm, layernorm, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layernorm_scale, layernorm_scale, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(layernorm_bias, layernorm_bias, fused_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(
+        layernorm_output, layernorm_output, fused_pattern);
+
+    if (!IsCompat(subgraph, graph)) {
+      LOG(WARNING) << "transpose layernorm pass op compat check failed.";
+      return;
+    }
+    // trans_layernorm is suited for an nchw-to-nhwc transpose before
+    // layernorm, so check for it.
+    std::vector<int> trans_axis =
+        PADDLE_GET_CONST(std::vector<int>, transpose->Op()->GetAttr("axis"));
+    if (trans_axis != std::vector<int>{0, 2, 3, 1}) {
+      VLOG(1) << "transpose layernorm fuse pass, transpose axis check fail, "
+                 "stop fusion";
+      return;
+    }
+    if (reshape->Op()->Type() == "flatten_contiguous_range") {
+      int start_axis =
+          PADDLE_GET_CONST(int, reshape->Op()->GetAttr("start_axis"));
+      int stop_axis =
+          PADDLE_GET_CONST(int, reshape->Op()->GetAttr("stop_axis"));
+      if (!(start_axis == 1 && stop_axis == 2)) {
+        VLOG(1) << "transpose layernorm fuse pass, flatten axis check fail, "
+                   "stop fusion";
+        return;
+      }
+    } else if (reshape->Op()->Type() == "reshape2") {
+      std::vector<int> reshape_shape =
+          PADDLE_GET_CONST(std::vector<int>, reshape->Op()->GetAttr("shape"));
+      if (reshape_shape.size() != 3) {
+        VLOG(1)
+            << "transpose layernorm fuse pass, reshape check fail, stop fusion";
+        return;
+      }
+    }
+    auto layernorm_begin_norm_axis =
+        PADDLE_GET_CONST(int, layernorm->Op()->GetAttr("begin_norm_axis"));
+    if (layernorm_begin_norm_axis != 2 && layernorm_begin_norm_axis != -1) {
+      VLOG(1) << "transpose layernorm fuse pass, layernorm begin norm axis "
+                 "check fail, stop fusion";
+      return;
+    }
+    std::unordered_set<const Node *> del_node_set;
+    // Create a trans_layernorm op node
+    OpDesc new_desc(*layernorm->Op());
+    new_desc.SetType("trans_layernorm");
+    new_desc.SetInput("X", {subgraph.at(x)->Name()});
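+    // The fused trans_layernorm op copies the layer_norm attributes
+    // (epsilon, begin_norm_axis) and exposes two outputs: Out_reshape is the
+    // transposed-and-flattened (n, h*w, c) tensor, Out_layernorm is the
+    // normalized result.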
new_desc.SetOutput("Out_reshape", {reshape_output->Name()}); + new_desc.SetOutput("Out_layernorm", {layernorm_output->Name()}); + new_desc.RemoveOutput("Y"); + new_desc.Flush(); + auto fused_node = graph->CreateOpNode(&new_desc); // OpDesc will be copied. + del_node_set.insert(transpose); + del_node_set.insert(transpose_output); + del_node_set.insert(reshape); + del_node_set.insert(layernorm); + GraphSafeRemoveNodes(graph, del_node_set); + + IR_NODE_LINK_TO(subgraph.at(x), fused_node); + IR_NODE_LINK_TO(layernorm_scale, fused_node); + IR_NODE_LINK_TO(layernorm_bias, fused_node); + IR_NODE_LINK_TO(fused_node, reshape_output); + IR_NODE_LINK_TO(fused_node, layernorm_output); + found_subgraph_count++; + }; + gpd(graph, handler); + return found_subgraph_count; +} + +void TransLayernormFusePass::ApplyImpl(ir::Graph *graph) const { + FusePassBase::Init("trans_layernorm_fuse_pass", graph); + int found_subgraph_count = ApplyConvTransLayernormPattern(graph); + AddStatis(found_subgraph_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(trans_layernorm_fuse_pass, + paddle::framework::ir::TransLayernormFusePass); diff --git a/paddle/fluid/framework/ir/trans_layernorm_fuse_pass.h b/paddle/fluid/framework/ir/trans_layernorm_fuse_pass.h new file mode 100644 index 00000000000000..e68032bc4b205f --- /dev/null +++ b/paddle/fluid/framework/ir/trans_layernorm_fuse_pass.h @@ -0,0 +1,132 @@ +/* Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { +// This pass aim to fuse below structure +// +// | +// transpose(axis= [0,2,3,1]) +// | +// reshape(n,h*w,c) +// | | +// out layernorm(begin_norm_axis=2 or -1) +// | +// layernorm_out +// +// ->fuse to +// +// | +// trans_layernorm +// | | +// out layernorm_out +// +class Graph; + +class TransLayernormFusePass : public FusePassBase { + public: + TransLayernormFusePass() { + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); + AddOpCompat(OpCompat("flatten_contiguous_range")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("start_axis") + .IsNumEQ(1) + .End() + .AddAttr("stop_axis") + .IsNumEQ(2) + .End(); + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End(); + } + virtual ~TransLayernormFusePass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + int ApplyConvTransLayernormPattern(ir::Graph* graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index bd49153f6b85e9..296607b6ec605d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -2490,6 +2490,7 @@ USE_TRT_CONVERTER(layernorm_shift_partition) USE_TRT_CONVERTER(reverse_roll) USE_TRT_CONVERTER(preln_layernorm_shift_partition) USE_TRT_CONVERTER(merge_layernorm) +USE_TRT_CONVERTER(trans_layernorm) USE_TRT_CONVERTER(skip_merge_layernorm) USE_TRT_CONVERTER(generic_plugin_creater) USE_TRT_CONVERTER(custom_plugin_creater) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index b5582518eacd21..edb57382a704b1 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -113,20 +113,24 @@ const std::vector kTRTSubgraphPasses({ "trt_skip_layernorm_fuse_pass", // "preln_skip_layernorm_fuse_pass", // #endif - "layernorm_shift_partition_fuse_pass", // - "merge_layernorm_fuse_pass", // - "preln_residual_bias_fuse_pass", // - "preln_layernorm_x_fuse_pass", // - "reverse_roll_fuse_pass", // - "conv_bn_fuse_pass", // - "unsqueeze2_eltwise_fuse_pass", // - "trt_squeeze2_matmul_fuse_pass", // - "trt_flatten2_matmul_fuse_pass", // - "trt_map_matmul_v2_to_mul_pass", // - "trt_map_matmul_v2_to_matmul_pass", // - "trt_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - 
"conv_elementwise_add_fuse_pass", // + "layernorm_shift_partition_fuse_pass", // + "merge_layernorm_fuse_pass", // + "preln_residual_bias_fuse_pass", // + "preln_layernorm_x_fuse_pass", // + "reverse_roll_fuse_pass", // + "conv_bn_fuse_pass", // + "unsqueeze2_eltwise_fuse_pass", // + "trt_squeeze2_matmul_fuse_pass", // + "trt_flatten2_matmul_fuse_pass", // + "trt_map_matmul_v2_to_mul_pass", // + "trt_map_matmul_v2_to_matmul_pass", // + "trt_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "conv_elementwise_add_fuse_pass", // +#if defined _WIN32 // Windows CI is TensorRT7.0. Remove this after upgrading. +#else + "trans_layernorm_fuse_pass", // +#endif "remove_padding_recover_padding_pass", // "delete_remove_padding_recover_padding_pass", // // "yolo_box_fuse_pass", // diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 7caa8765b8e235..c0e4ba79f50463 100755 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -92,6 +92,7 @@ list( take_along_axis_op.cc logsigmoid_op.cc preln_layernorm_shift_partition_op.cc + trans_layernorm_op.cc merge_layernorm_op.cc skip_merge_layernorm_op.cc generic_and_custom_plugin_creater.cc diff --git a/paddle/fluid/inference/tensorrt/convert/trans_layernorm_op.cc b/paddle/fluid/inference/tensorrt/convert/trans_layernorm_op.cc new file mode 100644 index 00000000000000..06b497aa4c9094 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/trans_layernorm_op.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.h" +namespace paddle { +namespace inference { +namespace tensorrt { + +class TransLayerNormOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(4) << "convert a trans_layer_norm fused op to tensorrt " + "trans_layernorm plugin"; + framework::OpDesc op_desc(op, nullptr); + + auto* X = engine_->GetITensor(op_desc.Input("X").front()); + auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); + auto* Scale_v = scope.FindVar(op_desc.Input("Scale").front()); + // we already check the begin_norm_axis in pass action. + // here we set begin_norm_axis as 3 to fit the calculation in trt plugin. + const int begin_norm_axis = 3; + const float eps = op_desc.HasAttr("epsilon") + ? 
PADDLE_GET_CONST(float, op_desc.GetAttr("epsilon")) + : 1e-5f; + PADDLE_ENFORCE_NOT_NULL( + Bias_v, + platform::errors::InvalidArgument( + "Input(Bias) of layer_norm should not be null.")); + PADDLE_ENFORCE_NOT_NULL( + Scale_v, + platform::errors::InvalidArgument( + "Input(Scale) of layer_norm should not be null.")); + + auto* Bias_t = Bias_v->GetMutable(); + auto* Scale_t = Scale_v->GetMutable(); + + auto bias_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Bias").front(), *Bias_t); + auto scale_weight = + engine_->GetFp32TrtWeight(op_desc.Input("Scale").front(), *Scale_t); + + nvinfer1::ILayer* layernorm_layer = nullptr; + if (engine_->with_dynamic_shape()) { + // For dynamic shape, + // the shape of mean and variance will be determine in configuPlugin. + std::vector mean_shape{1}; + std::vector variance_shape{1}; + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::TransLayerNormPluginDynamic* plugin = + new plugin::TransLayerNormPluginDynamic( + static_cast(bias_weight.get().values), + bias_weight.get().count, + static_cast(scale_weight.get().values), + scale_weight.get().count, + begin_norm_axis, + eps, + mean_shape, + variance_shape, + with_fp16); + layernorm_layer = engine_->AddDynamicPlugin(&X, 1, plugin); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "trans_layernorm do not support static shape mode yet")); + } + + auto output_layernorm_name = op_desc.Output("Out_layernorm").front(); + auto output_reshape_name = op_desc.Output("Out_reshape").front(); + RreplenishLayerAndOutput(layernorm_layer, + "trans_layernorm", + {output_layernorm_name, output_reshape_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(trans_layernorm, TransLayerNormOpConverter); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index e9c34408bb6bfd..50e223eaafe702 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -2461,7 +2461,13 @@ struct SimpleOpTypeSetTeller : public Teller { return false; } } - + if (op_type == "trans_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "The trans_layernorm op does not support " + "static shape yet"; + return false; + } + } if (op_type == "lookup_table") { if (!with_dynamic_shape) { VLOG(3) << "the lookup_table does not support " @@ -2631,6 +2637,7 @@ struct SimpleOpTypeSetTeller : public Teller { "logsigmoid", "preln_layernorm_shift_partition", "lookup_table", + "trans_layernorm", "merge_layernorm", "skip_merge_layernorm", "lookup_table_v2", @@ -2780,6 +2787,7 @@ struct SimpleOpTypeSetTeller : public Teller { "take_along_axis", "logsigmoid", "preln_layernorm_shift_partition", + "trans_layernorm", "merge_layernorm", "skip_merge_layernorm", "lookup_table", diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index b0d2eb6d347466..e11252b2208ac5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -33,6 +33,7 @@ list( layernorm_shift_partition_op.cu reverse_roll_op_plugin.cu prelnlayernorm_shift_partition_op.cu + trans_layernorm_op_plugin.cu merge_layernorm_op_plugin.cu skip_merge_layernorm_op_plugin.cu skip_groupnorm_act_op_plugin.cu diff --git a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu 
b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu index 945401c9404fd5..0307fbb1dd3509 100644 --- a/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/preln_residual_bias_plugin.cu @@ -26,12 +26,21 @@ #include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h" +#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/kernels/funcs/math_cuda_utils.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { + +inline int getSMVersion() { + const int device = phi::backends::gpu::GetCurrentDeviceId(); + const phi::gpuDeviceProp prop = + phi::backends::gpu::GetDeviceProperties(device); + return prop.major * 10 + prop.minor; +} + #ifdef TRT_PLUGIN_FP16_AVALIABLE #define FINAL_MASK 0xffffffff @@ -425,35 +434,65 @@ int PrelnResidualBiasPluginDynamic::enqueue( float *mean = nullptr; float *var = nullptr; const int VecSize = 8; -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - // if hidden is even, use half2 kernel generalAddBiasResidualLayerNormOpt2 - if (hidden % 2 == 0) { - int half_n = hidden / 2; - int half_n_32 = (half_n + 31) / 32 * 32; - dim3 block(std::min(half_n_32, 512)); - int rolls_per_thread = half_n / block.x; - int unroll_factor = 8; - while (unroll_factor > rolls_per_thread && unroll_factor > 1) { - unroll_factor /= 2; - } - switch (unroll_factor) { - case 1: - HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(1); - break; - case 2: - HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(2); - break; - case 4: - HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(4); - break; - case 8: - HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(8); - break; - default: - PADDLE_THROW(platform::errors::Fatal( - "Invalid UNROLL_FACTOR in preln_residual_bias trt plugin.")); + + int sm = getSMVersion(); + // sm >= 60 to support _ldg + if (sm >= 60) { + // if hidden is even, use half2 kernel generalAddBiasResidualLayerNormOpt2 + if (hidden % 2 == 0) { + int half_n = hidden / 2; + int half_n_32 = (half_n + 31) / 32 * 32; + dim3 block(std::min(half_n_32, 512)); + int rolls_per_thread = half_n / block.x; + int unroll_factor = 8; + while (unroll_factor > rolls_per_thread && unroll_factor > 1) { + unroll_factor /= 2; + } + switch (unroll_factor) { + case 1: + HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(1); + break; + case 2: + HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(2); + break; + case 4: + HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(4); + break; + case 8: + HALF2_ADD_BIAS_RESIDUAL_LAYERNORM_OPT2(8); + break; + default: + PADDLE_THROW(platform::errors::Fatal( + "Invalid UNROLL_FACTOR in preln_residual_bias trt plugin.")); + } + } else { + paddle::operators::FusedLayernormResidualDropoutBiasFunctor()( + rows, + cols, + seed, + dropout_prob, + is_upscale_in_train, + is_test, + increment, + epsilon, + src, + residual, + bias, + scale, + layernorm_bias, + mask_data, + dst, + layernorm_dst, + mean, + var, + stream); } } else { + // if sm < 60, use FusedLayernormResidualDropoutBiasFunctor only paddle::operators::FusedLayernormResidualDropoutBiasFunctor()( - rows, - cols, - seed, - dropout_prob, - is_upscale_in_train, - is_test, - increment, - epsilon, - src, - residual, - bias, - scale, - layernorm_bias, - mask_data, - dst, - layernorm_dst, - mean, - var, - stream); -#endif #else PADDLE_THROW(platform::errors::Fatal( "The Ernie(Bert) tensorRT plugin should be " diff --git 
a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu new file mode 100644 index 00000000000000..467c9737744dbf --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.cu @@ -0,0 +1,555 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.h" +#include "paddle/phi/kernels/funcs/math_cuda_utils.h" +#include "paddle/phi/kernels/layer_norm_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +inline int getSMVersion() { + const int device = phi::backends::gpu::GetCurrentDeviceId(); + const phi::gpuDeviceProp prop = + phi::backends::gpu::GetDeviceProperties(device); + return prop.major * 10 + prop.minor; +} + +#ifdef TRT_PLUGIN_FP16_AVALIABLE +#define FINAL_MASK 0xffffffff + +template +__global__ void GeneralResidualLayerNormOpt2(half2 *normed_output, + half2 *output, + const half2 *__restrict src, + const half2 *__restrict gamma, + const half2 *__restrict beta, + int m, + int n, + float epsilon) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + __shared__ float s_mean; + __shared__ float s_variance; + float x_sum = 0.0f; + float x2_sum = 0.0f; + const int b_offset = blockIdx.x * n; + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = b_offset + i; + float val_1 = 0.0f; + float val_2 = 0.0f; + half2 tmp; + tmp = __ldg(&src[index]); + val_1 += static_cast(tmp.x); + val_2 += static_cast(tmp.y); + output[index] = tmp; + x_sum += val_1 + val_2; + x2_sum += val_1 * val_1 + val_2 * val_2; + } + float sums[2]; + sums[0] = x_sum; + sums[1] = x2_sum; + phi::funcs::BlockReduceSumV2(sums); + constexpr int Half2VecSize = 2; + if (threadIdx.x == 0) { + s_mean = sums[0] / n / Half2VecSize; + s_variance = rsqrtf(sums[1] / n / Half2VecSize - s_mean * s_mean + epsilon); + } + __syncthreads(); + + half2 mean_2 = __float2half2_rn(s_mean); + half2 var_2 = __float2half2_rn(s_variance); + +#pragma unroll UNROLL_FACTOR + for (int i = threadIdx.x; i < n; i += blockDim.x) { + const int index = b_offset + i; + half2 val = __hmul2(__hmul2(__hsub2(output[index], mean_2), var_2), + __ldg(&gamma[i])); + if (beta) { + val = __hadd2(val, __ldg(&beta[i])); + } + normed_output[index] = val; + } +#endif +} + +#define HALF2_RESIDUAL_LAYERNORM_OPT2(UNROLL_FACTOR) \ + GeneralResidualLayerNormOpt2 \ + <<>>(reinterpret_cast(layernorm_dst), \ + reinterpret_cast(dst), \ + (const half2 *)input, \ + (const half2 *)fp16_scale_gpu_, \ + (const half2 *)fp16_bias_gpu_, \ + rows, \ + half_n, \ + eps); + +#endif + +int TransLayerNormPluginDynamic::initialize() TRT_NOEXCEPT { + if (!with_fp16_) { + cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size()); + cudaMemcpy(bias_gpu_, + 
bias_.data(), + bias_.size() * sizeof(float), + cudaMemcpyHostToDevice); + cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size()); + cudaMemcpy(scale_gpu_, + scale_.data(), + scale_.size() * sizeof(float), + cudaMemcpyHostToDevice); + } else { + cudaMalloc(&bias_gpu_, sizeof(float) * bias_.size()); + cudaMemcpy(bias_gpu_, + bias_.data(), + bias_.size() * sizeof(float), + cudaMemcpyHostToDevice); + cudaMalloc(&scale_gpu_, sizeof(float) * scale_.size()); + cudaMemcpy(scale_gpu_, + scale_.data(), + scale_.size() * sizeof(float), + cudaMemcpyHostToDevice); + std::vector fp16_bias_; + std::vector fp16_scale_; + fp16_bias_.resize(bias_.size()); + fp16_scale_.resize(scale_.size()); + for (int i = 0; i < bias_.size(); i++) { + fp16_bias_[i] = static_cast(bias_[i]); + } + for (int i = 0; i < scale_.size(); i++) { + fp16_scale_[i] = static_cast(scale_[i]); + } + + cudaMalloc(&fp16_bias_gpu_, sizeof(half) * fp16_bias_.size()); + cudaMemcpy(fp16_bias_gpu_, + fp16_bias_.data(), + fp16_bias_.size() * sizeof(half), + cudaMemcpyHostToDevice); + cudaMalloc(&fp16_scale_gpu_, sizeof(half) * fp16_scale_.size()); + cudaMemcpy(fp16_scale_gpu_, + fp16_scale_.data(), + fp16_scale_.size() * sizeof(half), + cudaMemcpyHostToDevice); + } + return 0; +} + +void TransLayerNormPluginDynamic::terminate() TRT_NOEXCEPT { + if (bias_gpu_) { + cudaFree(bias_gpu_); + bias_gpu_ = nullptr; + } + if (scale_gpu_) { + cudaFree(scale_gpu_); + scale_gpu_ = nullptr; + } + if (fp16_bias_gpu_) { + cudaFree(fp16_bias_gpu_); + fp16_bias_gpu_ = nullptr; + } + if (fp16_scale_gpu_) { + cudaFree(fp16_scale_gpu_); + fp16_scale_gpu_ = nullptr; + } +} + +nvinfer1::DimsExprs TransLayerNormPluginDynamic::getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs *inputDims, + int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { + nvinfer1::DimsExprs ret; + ret.nbDims = 3; + ret.d[0] = inputDims[0].d[0]; + ret.d[1] = expr_builder.operation(nvinfer1::DimensionOperation::kPROD, + *inputDims[0].d[2], + *inputDims[0].d[3]); + ret.d[2] = inputDims[0].d[1]; + return ret; +} + +bool TransLayerNormPluginDynamic::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc *in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + int feature_size = bias_.size(); + PADDLE_ENFORCE_GE( + feature_size, + 0, + platform::errors::InvalidArgument( + "The feature size of layernorm feature_size must be positive," + "but got:%d", + feature_size)); + + PADDLE_ENFORCE_NOT_NULL( + in_out, + platform::errors::InvalidArgument( + "The input of layernorm plugin shoule not be nullptr.")); + PADDLE_ENFORCE_LT( + pos, + nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, + nb_inputs + nb_outputs)); + const nvinfer1::PluginTensorDesc &in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { + if (feature_size % 8 == 0) { + // now, we only support khwc8 for feature_size % 8 == 0 + return ((in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::PluginFormat::kLINEAR || + in.format == nvinfer1::PluginFormat::kHWC8)); + } else { + return ((in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::PluginFormat::kLINEAR)); + } + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } + if (pos == 1) { + if (with_fp16_) { + return (in.type == in_out[0].type && + (in.format == nvinfer1::PluginFormat::kLINEAR)); + } else { + return (in.type == in_out[0].type) && + 
(in.format == nvinfer1::TensorFormat::kLINEAR); + } + } + if (pos == 2) { + if (with_fp16_) { + return (in.type == in_out[0].type && + (in.format == nvinfer1::PluginFormat::kLINEAR)); + } else { + return (in.type == in_out[0].type) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } +} + +void TransLayerNormPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc *in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, + int nbOutputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ( + begin_norm_axis_, + 3, + platform::errors::InvalidArgument( + "The transpose_LayerNorm Plugin only has begin_norm_axis_ = 3" + "but get %d.", + begin_norm_axis_)); + const auto &input_dims = in[0].desc.dims; + int statis_num = input_dims.d[0] * input_dims.d[2] * input_dims.d[3]; + mean_shape_[0] = statis_num; + variance_shape_[0] = statis_num; +} + +nvinfer1::DataType TransLayerNormPluginDynamic::getOutputDataType( + int index, + const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ( + nb_inputs, + 1, + platform::errors::InvalidArgument( + "The transpose_LayerNorm Plugin only has one input, so the " + "nb_inputs value should be 1, but get %d.", + nb_inputs)); + PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT || + input_types[0] == nvinfer1::DataType::kHALF), + true, + platform::errors::InvalidArgument( + "The input type should be half or float")); + return input_types[0]; +} + +int TransLayerNormPluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, + const void *const *inputs, + void *const *outputs, + void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { + const auto &input_dims = input_desc[0].dims; + int begin_norm_axis = begin_norm_axis_; + float eps = eps_; + + std::vector input_shape; + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + int input_numel = 1; + for (int i = 0; i < input_dims.nbDims; i++) { + input_numel *= input_dims.d[i]; + } + PADDLE_ENFORCE_EQ(1, + mean_shape_.size(), + platform::errors::InvalidArgument( + "Size of mean_shape vector should be equal to 1," + "but got Size of mean_shape vector:%d", + mean_shape_.size())); + PADDLE_ENFORCE_EQ(1, + variance_shape_.size(), + platform::errors::InvalidArgument( + "Size of variance_shape vector should be equal to 1," + "but got Size of mean_shape vector:%d", + mean_shape_.size())); + PADDLE_ENFORCE_GE(mean_shape_[0], + 0, + platform::errors::InvalidArgument( + "The size of mean vector should be positive," + "but got:%d", + mean_shape_[0])); + PADDLE_ENFORCE_GE(variance_shape_[0], + 0, + platform::errors::InvalidArgument( + "The size of mean vector should be positive," + "but got:%d", + variance_shape_[0])); + + // transpose do not change numel + int trans_result_numel = input_numel; + std::vector trans_result_shape{ + input_shape[0], input_shape[2], input_shape[3], input_shape[1]}; + + const auto input_ddim = phi::make_ddim(input_shape); + int feature_size = static_cast(input_ddim[1]); + PADDLE_ENFORCE_EQ(feature_size, + scale_.size(), + platform::errors::InvalidArgument( + "scale's size should be equal to the feature_size," + "but got feature_size:%d, scale's size:%d.", + feature_size, + scale_.size())); + PADDLE_ENFORCE_EQ(feature_size, + bias_.size(), + platform::errors::InvalidArgument( + "bias's size should be equal to the feature_size," + "but got feature_size:%d, bias's size:%d.", + feature_size, + bias_.size())); + + int device_id = -1; + 
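+  // Query the current GPU so that the Mean/Variance scratch tensors below
+  // are allocated on the device that executes the plugin.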
cudaGetDevice(&device_id); + PADDLE_ENFORCE_GE( + device_id, + 0, + platform::errors::InvalidArgument("device_id should be positive," + "but got:%d", + device_id)); + + mean_t.Resize(phi::make_ddim(mean_shape_)); + variance_t.Resize(phi::make_ddim(variance_shape_)); + float *mean_d = mean_t.mutable_data(platform::CUDAPlace(device_id)); + float *variance_d = + variance_t.mutable_data(platform::CUDAPlace(device_id)); + auto input_type = input_desc[0].type; + + paddle::platform::DeviceContextPool &pool = + paddle::platform::DeviceContextPool::Instance(); + platform::CUDAPlace place(platform::GetCurrentDeviceId()); + auto *device_context = static_cast(pool.Get(place)); + const phi::GPUContext &dev_ctx = *device_context; + + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. trans_layernorm-->fp32"; + VLOG(1) << "TRT Plugin format selected. trans_layernorm-->kLINEAR"; + const float *input = reinterpret_cast(inputs[0]); + float *layernorm_dst = static_cast(outputs[0]); + float *dst = static_cast(outputs[1]); + // transpose and norm do not change numel + int trans_result_numel = input_numel; + int norm_result_numel = input_numel; + phi::DenseTensorMeta input_meta(phi::DataType::FLOAT32, + phi::make_ddim(input_shape)); + phi::DenseTensorMeta bias_meta(phi::DataType::FLOAT32, + phi::make_ddim({feature_size})); + phi::DenseTensorMeta scale_meta(phi::DataType::FLOAT32, + phi::make_ddim({feature_size})); + phi::DenseTensorMeta trans_result_meta(phi::DataType::FLOAT32, + phi::make_ddim(trans_result_shape)); + phi::DenseTensorMeta norm_result_meta(phi::DataType::FLOAT32, + phi::make_ddim(trans_result_shape)); + std::shared_ptr input_alloc(new phi::Allocation( + static_cast(const_cast(input)), // NOLINT + input_numel * sizeof(float), + place)); + std::shared_ptr bias_alloc( + new phi::Allocation(static_cast(bias_gpu_), // NOLINT + feature_size * sizeof(float), + place)); + std::shared_ptr scale_alloc(new phi::Allocation( + static_cast(scale_gpu_), feature_size * sizeof(float), place)); + std::shared_ptr trans_result_alloc( + new phi::Allocation(static_cast(dst), // NOLINT + trans_result_numel * sizeof(float), + place)); + std::shared_ptr norm_result_alloc( + new phi::Allocation(static_cast(layernorm_dst), // NOLINT + norm_result_numel * sizeof(float), + place)); + + const phi::DenseTensor input_tensor = + phi::DenseTensor(input_alloc, input_meta); + phi::DenseTensor bias_tensor = phi::DenseTensor(bias_alloc, bias_meta); + phi::DenseTensor scale_tensor = phi::DenseTensor(scale_alloc, scale_meta); + phi::DenseTensor trans_result_tensor = + phi::DenseTensor(trans_result_alloc, trans_result_meta); + phi::DenseTensor norm_result_tensor = + phi::DenseTensor(norm_result_alloc, norm_result_meta); + + phi::TransposeKernel(dev_ctx, + input_tensor, + std::vector{0, 2, 3, 1}, + &trans_result_tensor); + phi::LayerNormKernel(dev_ctx, + trans_result_tensor, + scale_tensor, + bias_tensor, + eps, + begin_norm_axis, + &norm_result_tensor, + &mean_t, + &variance_t); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. trans_layernorm-->fp16"; + const half *input = reinterpret_cast(inputs[0]); + half *layernorm_dst = static_cast(outputs[0]); + half *dst = static_cast(outputs[1]); + if (input_desc[0].format == nvinfer1::PluginFormat::kLINEAR) { + VLOG(1) << "TRT Plugin format selected. 
trans_layernorm-->kLINEAR"; + phi::DenseTensorMeta input_meta(phi::DataType::FLOAT16, + phi::make_ddim(input_shape)); + std::shared_ptr input_alloc(new phi::Allocation( + static_cast(const_cast(input)), // NOLINT + input_numel * sizeof(half), + place)); + phi::DenseTensorMeta trans_result_meta( + phi::DataType::FLOAT16, phi::make_ddim(trans_result_shape)); + std::shared_ptr trans_result_alloc( + new phi::Allocation(static_cast(dst), // NOLINT + trans_result_numel * sizeof(half), + place)); + const phi::DenseTensor input_tensor = + phi::DenseTensor(input_alloc, input_meta); + phi::DenseTensor trans_result_tensor = + phi::DenseTensor(trans_result_alloc, trans_result_meta); + phi::TransposeKernel( + dev_ctx, + input_tensor, + std::vector{0, 2, 3, 1}, + &trans_result_tensor); + phi::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, + dst, + trans_result_shape, + bias_gpu_, + scale_gpu_, + layernorm_dst, + mean_d, + variance_d, + begin_norm_axis, + eps); + } else if (input_desc[0].format == nvinfer1::PluginFormat::kHWC8) { + VLOG(1) << "TRT Plugin format selected. trans_layernorm-->kHWC8"; + int sm = getSMVersion(); + // sm >= 60 to support __ldg + if (sm >= 60) { + int hidden = input_shape[1]; + if (hidden % 2 == 0) { + const size_t rows = + static_cast(input_shape[0] * input_shape[2] * + input_shape[3]); // batch * seq_length + int half_n = hidden / 2; + int half_n_32 = (half_n + 31) / 32 * 32; + dim3 block(std::min(half_n_32, 512)); + int rolls_per_thread = half_n / block.x; + int unroll_factor = 8; + while (unroll_factor > rolls_per_thread && unroll_factor > 1) { + unroll_factor /= 2; + } + switch (unroll_factor) { + case 1: + HALF2_RESIDUAL_LAYERNORM_OPT2(1); + break; + case 2: + HALF2_RESIDUAL_LAYERNORM_OPT2(2); + break; + case 4: + HALF2_RESIDUAL_LAYERNORM_OPT2(4); + break; + case 8: + HALF2_RESIDUAL_LAYERNORM_OPT2(8); + break; + default: + PADDLE_THROW(platform::errors::Fatal( + "Invalid UNROLL_FACTOR in transpose_layernorm trt plugin.")); + } + } else { + cudaMemcpyAsync( + dst, input, input_numel * sizeof(half), cudaMemcpyDeviceToDevice); + phi::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, + input, + trans_result_shape, + bias_gpu_, + scale_gpu_, + layernorm_dst, + mean_d, + variance_d, + begin_norm_axis, + eps); + } + } else { + cudaMemcpyAsync( + dst, input, input_numel * sizeof(half), cudaMemcpyDeviceToDevice); + phi::LayerNormDirectCUDAFunctor layer_norm; + layer_norm(stream, + input, + trans_result_shape, + bias_gpu_, + scale_gpu_, + layernorm_dst, + mean_d, + variance_d, + begin_norm_axis, + eps); + } + } + } else { + PADDLE_THROW( + platform::errors::Fatal("The TransLayerNormPluginDynamic TRT Plugin's " + "input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.h new file mode 100644 index 00000000000000..6f150c33c0b90e --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trans_layernorm_op_plugin.h @@ -0,0 +1,176 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { +class TransLayerNormPluginDynamic : public DynamicPluginTensorRT { + public: + TransLayerNormPluginDynamic(const float* bias, + const int bias_num, + const float* scale, + const int scale_num, + int begin_norm_axis, + float eps, + std::vector mean_shape, + std::vector variance_shape, + bool with_fp16) + : begin_norm_axis_(begin_norm_axis), + eps_(eps), + mean_shape_(mean_shape), + variance_shape_(variance_shape) { + with_fp16_ = with_fp16; + bias_.resize(bias_num); + scale_.resize(scale_num); + std::copy(bias, bias + bias_num, bias_.data()); + std::copy(scale, scale + scale_num, scale_.data()); + } + + TransLayerNormPluginDynamic(void const* serialData, size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &bias_); + DeserializeValue(&serialData, &serialLength, &scale_); + DeserializeValue(&serialData, &serialLength, &begin_norm_axis_); + DeserializeValue(&serialData, &serialLength, &eps_); + DeserializeValue(&serialData, &serialLength, &mean_shape_); + DeserializeValue(&serialData, &serialLength, &variance_shape_); + DeserializeValue(&serialData, &serialLength, &with_fp16_); + } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + auto ptr = new TransLayerNormPluginDynamic(bias_.data(), + bias_.size(), + scale_.data(), + scale_.size(), + begin_norm_axis_, + eps_, + mean_shape_, + variance_shape_, + with_fp16_); + ptr->bias_gpu_ = bias_gpu_; + ptr->scale_gpu_ = scale_gpu_; + ptr->fp16_bias_gpu_ = fp16_bias_gpu_; + ptr->fp16_scale_gpu_ = fp16_scale_gpu_; + return ptr; + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "trans_layernorm_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 2; } + int initialize() TRT_NOEXCEPT override; + void terminate() TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override { + return SerializedSize(bias_) + SerializedSize(scale_) + + SerializedSize(begin_norm_axis_) + SerializedSize(eps_) + + SerializedSize(mean_shape_) + SerializedSize(variance_shape_) + + SerializedSize(with_fp16_); + } + + void serialize(void* buffer) const TRT_NOEXCEPT override { + SerializeValue(&buffer, bias_); + SerializeValue(&buffer, scale_); + SerializeValue(&buffer, begin_norm_axis_); + SerializeValue(&buffer, eps_); + SerializeValue(&buffer, mean_shape_); + SerializeValue(&buffer, variance_shape_); + SerializeValue(&buffer, with_fp16_); + } + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT + TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const 
nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const + TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + std::vector bias_; + std::vector scale_; + phi::DenseTensor mean_t; + phi::DenseTensor variance_t; + int begin_norm_axis_; + float eps_; + std::vector mean_shape_; + std::vector variance_shape_; + // data on devices + float* bias_gpu_{nullptr}; + float* scale_gpu_{nullptr}; + half* fp16_bias_gpu_{nullptr}; + half* fp16_scale_gpu_{nullptr}; +}; + +class TransLayerNormPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "trans_layernorm_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) + TRT_NOEXCEPT override { + return new TransLayerNormPluginDynamic(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(TransLayerNormPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 2dd35d10d5ff05..0cb7c8776e2515 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -31,6 +31,9 @@ if(WIN32) list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_element_groupnorm_act_fuse_pass") list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_groupnorm_act_pass_fuse_pass") + list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_trans_layernorm") + list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_trans_layernorm") + list(REMOVE_ITEM TEST_INFERENCE_IR_PASSES "test_trt_convert_trans_layernorm") list(REMOVE_ITEM TEST_TRT_IR_PASSES "test_trt_convert_fused_token_prune") list(REMOVE_ITEM TEST_TRT_CONVERTER "test_trt_convert_fused_token_prune") endif() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_trans_layernorm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_trans_layernorm.py new file mode 100644 index 00000000000000..3d7236d2bba7df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_trans_layernorm.py @@ -0,0 +1,249 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import unittest +from functools import partial +from typing import List + +import numpy as np +from program_config import ProgramConfig, TensorConfig +from trt_layer_auto_scan_test import TrtLayerAutoScanTest + +import paddle.inference as paddle_infer + + +class TrtConvertTransLayernormTest(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def conv_filter_datagen(dics): + c = dics["c"] + x = (np.random.randn(c, c, 1, 1)) / np.sqrt(c) + return x.astype(np.float32) + + def elementwise_bias_datagen(dics): + c = dics["c"] + x = np.random.random([c]) * 0.01 + return x.astype(np.float32) + + def layernorm_bias_datagen(dics): + c = dics["c"] + x = np.random.random([c]) * 0.1 + return x.astype(np.float32) + + def layernorm_scale_datagen(dics): + x = np.ones([c]) + return x.astype(np.float32) + + def conv2d_input_datagen(dics): + x = np.random.randn(dics["batch"], dics["c"], dics["h"], dics["w"]) + x = (x - np.mean(x)) / (np.std(x)) + return x.astype(np.float32) + + for batch in [2]: + for begin_norm_axis in [2]: + for h in [32, 64]: + for w in [32, 64]: + for c in [128, 320, 255, 133]: + for reshape in ["flatten", "reshape"]: + dics = { + "batch": batch, + "begin_norm_axis": begin_norm_axis, + "h": h, + "w": w, + "c": c, + "flatten": { + "op_type": "flatten_contiguous_range", + "op_inputs": { + "X": ["transpose2_out"], + }, + "op_outputs": { + "Out": ["reshape_out"], + }, + "op_attrs": { + "start_axis": 1, + "stop_axis": 2, + }, + }, + "reshape": { + "op_type": "reshape2", + "op_inputs": { + "X": ["transpose2_out"], + }, + "op_outputs": { + "Out": ["reshape_out"], + }, + "op_attrs": {"shape": [-1, h * w, c]}, + }, + } + ops_config = [ + { + "op_type": "conv2d", + "op_inputs": { + "Input": ["conv2d_input"], + "Filter": ["conv2d_filter"], + }, + "op_outputs": { + "Output": ["conv2d_output"], + }, + "op_attrs": { + "dilations": [1, 1], + "padding_algorithm": "EXPLICIT", + "groups": 1, + "paddings": [0, 0], + "strides": [1, 1], + "data_format": "NCHW", + }, + }, + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["conv2d_output"], + "Y": ["elementwise_bias"], + }, + "op_outputs": { + "Out": ["elementwise_out"] + }, + "op_attrs": {"axis": 1}, + }, + { + "op_type": "transpose2", + "op_inputs": { + "X": ["elementwise_out"], + }, + "op_outputs": { + "Out": ["transpose2_out"], + }, + "op_attrs": {"axis": [0, 2, 3, 1]}, + }, + dics[reshape], + { + "op_type": "layer_norm", + "op_inputs": { + "X": ["reshape_out"], + "Bias": ["layernorm_bias"], + "Scale": ["layernorm_scale"], + }, + "op_outputs": { + "Y": ["layernorm_out"], + "Mean": ["layernorm_mean"], + "Variance": ["layernorm_variance"], + }, + "op_attrs": { + "epsilon": 1e-5, + "begin_norm_axis": dics[ + "begin_norm_axis" + ], + }, + }, + ] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={ + "conv2d_filter": TensorConfig( + data_gen=partial( + conv_filter_datagen, dics + ) + ), + "elementwise_bias": TensorConfig( + data_gen=partial( + elementwise_bias_datagen, dics + ) + ), + "layernorm_bias": TensorConfig( + data_gen=partial( + layernorm_bias_datagen, dics + ) + ), + "layernorm_scale": TensorConfig( + data_gen=partial( + layernorm_scale_datagen, dics + ) + ), + }, + inputs={ + "conv2d_input": TensorConfig( + data_gen=partial( + conv2d_input_datagen, dics + ) + ), + }, + outputs=["reshape_out", 
"layernorm_out"], + ) + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs, inputs): + conv2d_c = inputs['conv2d_input'].shape[1] + self.dynamic_shape.min_input_shape = { + "conv2d_input": [1, conv2d_c, 32, 32], + "conv2d_filter": [conv2d_c, conv2d_c, 1, 1], + "elementwise_bias": [conv2d_c], + "layernorm_bias": [conv2d_c], + "layernorm_scale": [conv2d_c], + } + self.dynamic_shape.max_input_shape = { + "conv2d_input": [4, conv2d_c, 64, 64], + "conv2d_filter": [conv2d_c, conv2d_c, 1, 1], + "elementwise_bias": [conv2d_c], + "layernorm_bias": [conv2d_c], + "layernorm_scale": [conv2d_c], + } + self.dynamic_shape.opt_input_shape = { + "conv2d_input": [4, conv2d_c, 64, 64], + "conv2d_filter": [conv2d_c, conv2d_c, 1, 1], + "elementwise_bias": [conv2d_c], + "layernorm_bias": [conv2d_c], + "layernorm_scale": [conv2d_c], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 3 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + inputs = program_config.inputs + # just support dynamic_shape + generate_dynamic_shape(attrs, inputs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), ( + 1e-2, + 1e-2, + ) # tol 1e-2 for half + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main()