diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 56f7fe75d2062..20d1df89bebe1 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -248,6 +248,8 @@ if(WITH_XPU) pass_library(stack_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(fused_multi_transformer_cachekv_layout_trans_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(add_activation_xpu_fuse_pass inference DIR xpu DEPS + ${XPU_PASS_DEPS}) endif() cc_library( diff --git a/paddle/fluid/framework/ir/xpu/add_activation_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/add_activation_xpu_fuse_pass.cc new file mode 100644 index 0000000000000..102ca018ef8c3 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/add_activation_xpu_fuse_pass.cc @@ -0,0 +1,195 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "glog/logging.h" + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/ir/xpu/quant_utils.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +/* +fuse ele_add + activation block in to xpu_ele_fusion op +For example: +graph: + ele_x + | + | + elementwise_add -----ele_y + | + | + act + | + | + out_Out +------------------------------------------------------ +After the pass is applied: + Input + | ele_y + | / + | / + Input_max ---- add_act_fusion ---- ele_y_max + | \ + | \ + | OutputMax + Output +*/ +struct AddActXPUPattern : public PatternBase { + AddActXPUPattern(PDPattern* pattern, + const std::string& name_scope, + const std::string& act_type); + // declare operator node's name + PATTERN_DECL_NODE(ele_add); + PATTERN_DECL_NODE(act); + // declare variable node's name + PATTERN_DECL_NODE(ele_x); + PATTERN_DECL_NODE(ele_y); + PATTERN_DECL_NODE(ele_out); + PATTERN_DECL_NODE(act_out); + + private: + std::string act_type_; +}; + +AddActXPUPattern::AddActXPUPattern(PDPattern* pattern, + const std::string& name_scope, + const std::string& act_type) + : PatternBase(pattern, name_scope, name_scope), act_type_(act_type) { + auto ele_add = + pattern->NewNode(ele_add_repr())->assert_is_op("elementwise_add"); + auto ele_x = pattern->NewNode(ele_x_repr()) + ->assert_is_op_input("elementwise_add", "X") + ->assert_var_not_persistable() + ->AsInput(); + auto ele_y = pattern->NewNode(ele_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->assert_var_not_persistable() + ->AsInput(); + auto ele_out = pattern->NewNode(ele_out_repr()) + 
->assert_is_op_output("elementwise_add", "Out") + ->assert_has_n_outputs(1); + ele_add->LinksFrom({ele_x, ele_y}).LinksTo({ele_out}); + ele_out->assert_is_op_input(act_type_, "X"); + auto act = pattern->NewNode(act_repr())->assert_is_op(act_type_); + auto act_out = + pattern->NewNode(act_out_repr())->assert_is_op_output(act_type_, "Out"); + act->LinksFrom({ele_out}).LinksTo({act_out}); +} + +} // namespace patterns + +class AddActXPUFusePass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + int ApplyImpl(ir::Graph* graph, const std::string& act_type) const; + + const std::string name_scope_{"add_activation_xpu_fuse_pass"}; +}; + +void AddActXPUFusePass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + + int found_subgraph_count = 0; + for (auto act_type : {"relu", "gelu"}) { + found_subgraph_count += ApplyImpl(graph, act_type); + } + AddStatis(found_subgraph_count); +} + +int AddActXPUFusePass::ApplyImpl(ir::Graph* graph, + const std::string& act_type) const { + GraphPatternDetector gpd; + patterns::AddActXPUPattern pattern( + gpd.mutable_pattern(), name_scope_, act_type); + int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle AddActXPUFusePass fuse"; + /* declare operator node's name */ + GET_IR_NODE(ele_add); + GET_IR_NODE(act); + /* declare variable node's name*/ + GET_IR_NODE(ele_x); + GET_IR_NODE(ele_y); + GET_IR_NODE(ele_out); + GET_IR_NODE(act_out); + auto* block = ele_add->Op()->Block(); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); + std::string fused_op_out_name; + fused_op_out_name = act_out->Name(); + std::string fused_op_out_max_name = fused_op_out_name + "_max"; + VarDesc fused_op_out_max_desc(fused_op_out_max_name); + Node* fused_op_out_max = graph->CreateVarNode(&fused_op_out_max_desc); + // Generate add_act fused op + framework::OpDesc fused_op_desc(block); + fused_op_desc.SetType("add_act_xpu"); + // set attrs for fused op + fused_op_desc.SetAttr("act_type", ConvertActivationType(act_type)); + fused_op_desc.SetInput("x", {ele_x->Name()}); + fused_op_desc.SetInput("y", {ele_y->Name()}); + fused_op_desc.SetOutput("out", {fused_op_out_name}); + fused_op_desc.SetOutput("out_max", {fused_op_out_max_name}); + // relink fused op + auto* fused_op = graph->CreateOpNode(&fused_op_desc); + IR_NODE_LINK_TO(ele_x, fused_op); + IR_NODE_LINK_TO(ele_y, fused_op); + IR_NODE_LINK_TO(fused_op, act_out); + IR_NODE_LINK_TO(fused_op, fused_op_out_max); + // delete useless node + std::unordered_set delete_nodes = {ele_add, act, ele_out}; + GraphSafeRemoveNodes(graph, delete_nodes); + found_subgraph_count++; + }; + + gpd(graph, handler); + return found_subgraph_count; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(add_activation_xpu_fuse_pass, + paddle::framework::ir::AddActXPUFusePass); + +REGISTER_PASS_CAPABILITY(add_activation_xpu_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination().EQ( + "add_act_xpu", 0)); diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 29f8aaea50a67..31e8618e077e0 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ 
b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -12,13 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h" #include #include "glog/logging.h" -#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" @@ -36,165 +35,211 @@ class Scope; namespace paddle { namespace framework { namespace ir { + namespace patterns { +struct LinkAddActPattern : public PatternBase { + LinkAddActPattern(PDPattern* pattern, const std::string& name_scope); + + // declare operator node's name + PATTERN_DECL_NODE(fusion_op); + // declare variable node's name + PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(ele_y); +}; + +LinkAddActPattern::LinkAddActPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + auto* fusion_op = + pattern->NewNode(fusion_op_repr())->assert_is_op("add_act_xpu"); + auto* x = pattern->NewNode(x_repr())->assert_is_op_input("add_act_xpu", "x"); + auto* ele_y = + pattern->NewNode(ele_y_repr())->assert_is_op_input("add_act_xpu", "y"); + fusion_op->LinksFrom({x, ele_y}); +} -struct FusionXPUOpPattern : public PatternBase { - FusionXPUOpPattern(PDPattern* pattern, - const std::string& name_scope, - const std::string& op_type, - bool with_branch); +struct LinkConv2dPattern : public PatternBase { + LinkConv2dPattern(PDPattern* pattern, + const std::string& name_scope, + bool with_branch); // declare operator node's name PATTERN_DECL_NODE(fusion_op); // declare variable node's name - PATTERN_DECL_NODE(input); + PATTERN_DECL_NODE(x); PATTERN_DECL_NODE(branch); private: - std::string op_type_; bool with_branch_{false}; }; -FusionXPUOpPattern::FusionXPUOpPattern(PDPattern* pattern, - const std::string& name_scope, - const std::string& op_type, - bool with_branch) - : PatternBase(pattern, name_scope, name_scope), - op_type_(op_type), - with_branch_(with_branch) { - auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op(op_type_); - auto* input = - pattern->NewNode(input_repr())->assert_is_op_input(op_type_, "x"); - +LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern, + const std::string& name_scope, + bool with_branch) + : PatternBase(pattern, name_scope, name_scope), with_branch_(with_branch) { + auto* fusion_op = + pattern->NewNode(fusion_op_repr())->assert_is_op("conv2d_xpu"); + auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x"); PDNode* branch = nullptr; if (with_branch_) { - branch = - pattern->NewNode(branch_repr())->assert_is_op_input(op_type_, "branch"); - fusion_op->LinksFrom({input, branch}); - } else { - fusion_op->LinksFrom({input}); + branch = pattern->NewNode(branch_repr()) + ->assert_is_op_input("conv2d_xpu", "branch"); + fusion_op->LinksFrom({branch}); } + fusion_op->LinksFrom({x}); } -} // namespace patterns +struct LinkFcPattern : public PatternBase { + LinkFcPattern(PDPattern* pattern, const std::string& name_scope); -class LinkXPUOpMaxPass : public FusePassBase { - protected: - void ApplyImpl(ir::Graph* graph) const override; + // declare operator node's name + PATTERN_DECL_NODE(fusion_op); + // declare variable node's name + PATTERN_DECL_NODE(x); +}; - private: - void ApplyImpl(ir::Graph* graph, - const 
std::string& op_type, - bool with_branch) const; +LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, name_scope) { + auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op("fc_xpu"); + auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); - const std::string name_scope_{"link_xpu_op_max_pass"}; - // ops with x_max/out_max - std::set op_types_{"fc_xpu", "conv2d_xpu"}; -}; + fusion_op->LinksFrom({x}); +} -/* -Origin subgraph: - fusion_xpu_op0 - / \ - | | - out0 out0_max - | - \ - fusion_op -Fused subgraph: - fusion_xpu_op0 - / \ - | | - out0 out0_max - | | - \ / - fusion_op - -Origin subgraph1: - fusion_xpu_op0 fusion_xpu_op1 - / \ / \ - | | | | - out0 out0_max out1 out1_max - | | - (x) \ / (branch) - fusion_xpu_op2 -Fused subgraph1: - fusion_xpu_op0 fusion_xpu_op1 - / \ / \ - | | | | - out0 out0_max out1 out1_max - | | | | - (x) \ |(x_max) |(branch) /(branch_max) - \ | | / - \ | | / - \ | | / - fusion_xpu_op2 -*/ -void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph) const { - Init(name_scope_, graph); - for (auto op_type : op_types_) { - for (auto with_branch : {true, false}) { - ApplyImpl(graph, op_type, with_branch); +} // namespace patterns + +void LinkXPUOpMaxPass::LinkAddActMax(ir::Graph* graph) const { + GraphPatternDetector gpd; + patterns::LinkAddActPattern pattern(gpd.mutable_pattern(), name_scope_); + int found_subgraph_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle LinkAddActMax"; + /* declare operator node's name */ + GET_IR_NODE(fusion_op); + /* declare variable node's name*/ + GET_IR_NODE(x); + GET_IR_NODE(ele_y); + auto* fusion_op_desc = fusion_op->Op(); + auto* x_pre_op = x->inputs[0]->Op(); + if (x->inputs.size() > 0 && x->inputs[0]->IsOp() && + x_pre_op->HasOutput("out_max")) { + auto preop_max_var_name = x_pre_op->Output("out_max"); + for (auto max_node : x->inputs[0]->outputs) { + if (preop_max_var_name[0] == max_node->Name()) { + fusion_op_desc->SetInput("x_max", {max_node->Name()}); + IR_NODE_LINK_TO(max_node, fusion_op); + } + } } - } + auto* ele_y_pre_op = ele_y->inputs[0]->Op(); + if (ele_y->inputs.size() > 0 && ele_y->inputs[0]->IsOp() && + ele_y_pre_op->HasOutput("out_max")) { + auto preop_max_var_name = ele_y_pre_op->Output("out_max"); + for (auto max_node : ele_y->inputs[0]->outputs) { + if (preop_max_var_name[0] == max_node->Name()) { + fusion_op_desc->SetInput("y_max", {max_node->Name()}); + IR_NODE_LINK_TO(max_node, fusion_op); + } + } + } + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); } -void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph, - const std::string& op_type, - bool with_branch) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::PreconditionNotMet("graph should not be null.")); +void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const { GraphPatternDetector gpd; - patterns::FusionXPUOpPattern pattern( - gpd.mutable_pattern(), name_scope_, op_type, with_branch); - + patterns::LinkConv2dPattern pattern( + gpd.mutable_pattern(), name_scope_, with_branch); int found_subgraph_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { - VLOG(4) << "handle LinkXPUOpMaxPass fuse"; + VLOG(4) << "handle LinkConv2dMax"; + /* declare operator node's name */ GET_IR_NODE(fusion_op); - GET_IR_NODE(input); + /* declare variable node's name*/ + GET_IR_NODE(x); GET_IR_NODE(branch); - 
auto* fusion_op_desc = fusion_op->Op(); - if (fusion_op_desc->HasAttr("has_branch")) { - bool fusion_op_branch = - PADDLE_GET_CONST(bool, fusion_op_desc->GetAttr("has_branch")); - if (fusion_op_branch != with_branch) { - return; - } - } - if (input->inputs.size() > 0 && input->inputs[0]->IsOp() && - input->inputs[0]->Op()->HasOutput("out_max")) { - auto input_max_name = input->inputs[0]->Op()->Output("out_max"); - for (auto max_node : input->inputs[0]->outputs) { - if (input_max_name[0] == max_node->Name()) { + auto* x_pre_op = x->inputs[0]->Op(); + if (x->inputs.size() > 0 && x->inputs[0]->IsOp() && + x_pre_op->HasOutput("out_max")) { + auto preop_max_var_name = x_pre_op->Output("out_max"); + for (auto max_node : x->inputs[0]->outputs) { + if (preop_max_var_name[0] == max_node->Name()) { fusion_op_desc->SetInput("x_max", {max_node->Name()}); IR_NODE_LINK_TO(max_node, fusion_op); - found_subgraph_count++; } } } - if (with_branch) { + auto* branch_pre_op = branch->inputs[0]->Op(); if (branch->inputs.size() > 0 && branch->inputs[0]->IsOp() && - branch->inputs[0]->Op()->HasOutput("out_max")) { - auto branch_max_name = branch->inputs[0]->Op()->Output("out_max"); + branch_pre_op->HasOutput("out_max")) { + auto preop_max_var_name = branch_pre_op->Output("out_max"); for (auto max_node : branch->inputs[0]->outputs) { - if (branch_max_name[0] == max_node->Name()) { + if (preop_max_var_name[0] == max_node->Name()) { fusion_op_desc->SetInput("branch_max", {max_node->Name()}); IR_NODE_LINK_TO(max_node, fusion_op); - found_subgraph_count++; } } } } + found_subgraph_count++; }; gpd(graph, handler); AddStatis(found_subgraph_count); } +void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const { + GraphPatternDetector gpd; + patterns::LinkFcPattern pattern(gpd.mutable_pattern(), name_scope_); + int found_subgraph_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + VLOG(4) << "handle LinkFcMax"; + /* declare operator node's name */ + GET_IR_NODE(fusion_op); + /* declare variable node's name*/ + GET_IR_NODE(x); + auto* fusion_op_desc = fusion_op->Op(); + auto* x_pre_op = x->inputs[0]->Op(); + if (x->inputs.size() > 0 && x->inputs[0]->IsOp() && + x_pre_op->HasOutput("out_max")) { + auto preop_max_var_name = x_pre_op->Output("out_max"); + for (auto max_node : x->inputs[0]->outputs) { + if (preop_max_var_name[0] == max_node->Name()) { + fusion_op_desc->SetInput("x_max", {max_node->Name()}); + IR_NODE_LINK_TO(max_node, fusion_op); + } + } + } + found_subgraph_count++; + }; + + gpd(graph, handler); + AddStatis(found_subgraph_count); +} + +void LinkXPUOpMaxPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::PreconditionNotMet("graph should not be null.")); + Init(name_scope_, graph); + + LinkFcMax(graph); + for (auto with_branch : {true, false}) { + LinkConv2dMax(graph, with_branch); + } + LinkAddActMax(graph); +} + } // namespace ir } // namespace framework } // namespace paddle @@ -203,5 +248,7 @@ REGISTER_PASS(link_xpu_op_max_pass, paddle::framework::ir::LinkXPUOpMaxPass); REGISTER_PASS_CAPABILITY(link_xpu_op_max_pass) .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination().EQ( - "fc_xpu", 0)); + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc_xpu", 0) + .EQ("conv2d_xpu", 0) + .EQ("add_act_xpu", 0)); diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h new file mode 100644 index 
0000000000000..cad199ce573bb --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h @@ -0,0 +1,110 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace phi { +class DenseTensor; +} // namespace phi + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +class LinkXPUOpMaxPass : public FusePassBase { + protected: + void ApplyImpl(ir::Graph* graph) const override; + + private: + /* + Origin subgraph: + fusion_xpu_op0 + / \ + | | + out0 out0_max + | + \ + fc_xpu +Fused subgraph: + fusion_xpu_op0 + / \ + | | + out0 out0_max + | | + \ / + fc_xpu + */ + void LinkFcMax(ir::Graph* graph) const; + + /* + Origin subgraph: + fusion_xpu_op0 fusion_xpu_op1 + / \ / \ + | | | | + out0 out0_max out1 out1_max + | | + (x) \ / (branch) + conv2d_xpu +Fused subgraph: + fusion_xpu_op0 fusion_xpu_op1 + / \ / \ + | | | | + out0 out0_max out1 out1_max + | | | | + (x) \ |(x_max) |(branch) /(branch_max) + \ | | / + \ | | / + \ | | / + conv2d_xpu + */ + void LinkConv2dMax(ir::Graph* graph, bool with_branch) const; + + /* + Origin subgraph: + fusion_xpu_op0 fusion_xpu_op1 + / \ / \ + | | | | + out0 out0_max out1 out1_max + | | + (x) \ / (y) + add_act_xpu + Fused subgraph: + fusion_xpu_op0 fusion_xpu_op1 + / \ / \ + | | | | + out0 out0_max out1 out1_max + | | | | + (x) \ |(x_max) |(y) /(y_max) + \ | | / + \ | | / + \ | | / + add_act_xpu + */ + void LinkAddActMax(ir::Graph* graph) const; + + const std::string name_scope_{"link_xpu_op_max_pass"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 60bf333df5272..b1f92d7b640c7 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -527,6 +527,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "sigmoid_elementmul_fuse_pass", "fc_xpu_fuse_pass", "conv2d_xpu_fuse_pass", + "add_activation_xpu_fuse_pass", "link_xpu_op_max_pass", "inplace_op_var_pass", "delete_isolated_node_pass", diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index 1e85309c28a77..c8be15e5b5551 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -4,6 +4,16 @@ # if one operator have "support_dygraph_mode : true", it supports dygraph mode, # otherwise the operator only could be used in static mode. 
+- op : add_act_xpu + args : (Tensor x, Tensor x_max, Tensor y, Tensor y_max, int act_type) + output : Tensor(out), Tensor(out_max) + infer_meta : + func : AddActXPUInferMeta + kernel : + func : add_act_xpu + data_type : x + optional : x_max, y_max + - op : conv2d_xpu args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, bool has_bias, bool has_branch, int act_type, float act_param) output : Tensor(out), Tensor(out_max) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index a8bf526cf87b4..4470920f0b1fd 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -22,6 +22,8 @@ namespace xpu { XPUOpMap& get_kl2_ops() { // KL2支持的op,通过op_name, data_type, place来索引 static XPUOpMap s_xpu2_kernels{ + {"add_act_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"abs", XPUKernelSet({phi::DataType::FLOAT32})}, {"abs_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 39a76bc7143b9..78bf3d01de090 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -19,9 +19,66 @@ limitations under the License. */ #include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/meta_tensor.h" #include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/funcs/common_shape.h" namespace phi { +void AddActXPUInferMeta(const MetaTensor& x, + const MetaTensor& x_max, + const MetaTensor& y, + const MetaTensor& y_max, + int act_type, + MetaTensor* out, + MetaTensor* out_max) { + int axis = -1; + if (x.dims() != y.dims()) { + auto x_dims = x.dims(); + auto y_dims = y.dims(); + int max_dim = std::max(x_dims.size(), y_dims.size()); + if (x_dims.size() == y_dims.size()) { + PADDLE_ENFORCE_EQ((axis == -1) || (axis == 0), + true, + phi::errors::InvalidArgument( + "axis should be -1 or 0 while the dimension of " + "tensor X (%s) is equal to the dimension of " + "tensor Y (%s), but received axis: %s", + x_dims.size(), + y_dims.size(), + axis)); + } + PADDLE_ENFORCE_EQ((axis >= (-1 * max_dim)) && (axis < max_dim), + true, + phi::errors::InvalidArgument( + "The axis range must be [%s, %s), but axis is %s. " + "Please set the axis again.", + -1 * max_dim, + max_dim, + axis)); + axis = (axis < 0 ? (std::abs(x_dims.size() - y_dims.size()) + axis + 1) + : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + funcs::GetBroadcastDimsArrays(x_dims, + y_dims, + x_dims_array.data(), + y_dims_array.data(), + out_dims_array.data(), + max_dim, + axis); + auto out_dims = phi::make_ddim(out_dims_array); + out->set_dims(out_dims); + } else { + out->set_dims(x.dims()); + } + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); + out->share_lod(x); + out_max->set_dims(phi::make_ddim({6})); + out_max->set_dtype(x.dtype()); + out_max->set_layout(x.layout()); +} + inline int ConvOutSize(int input_size, int filter_size, int dilation, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index 7a844eeff0c9b..751bbfd95f764 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -22,6 +22,14 @@ namespace phi { // Common InferMeta Functions for fusion operators. // NOTE: The InferMeta Functions in this file are arranged in alphabetic order. 
+void AddActXPUInferMeta(const MetaTensor& x, + const MetaTensor& x_max, + const MetaTensor& y, + const MetaTensor& y_max, + int act_type, + MetaTensor* out, + MetaTensor* out_max); + void Conv2dXPUInferMeta(const MetaTensor& x, const MetaTensor& x_max, const MetaTensor& filter, diff --git a/paddle/phi/kernels/fusion/xpu/add_act_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/add_act_xpu_kernel.cc new file mode 100644 index 0000000000000..ca221cbc7f412 --- /dev/null +++ b/paddle/phi/kernels/fusion/xpu/add_act_xpu_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +namespace fusion { + +template +void AddActXPUKernel(const Context& ctx, + const DenseTensor& x, + const paddle::optional& x_max, + const DenseTensor& y, + const paddle::optional& y_max, + int act_type, + DenseTensor* out, + DenseTensor* out_max) { + using XPUType = typename XPUTypeTrait::Type; + + auto* x_data = reinterpret_cast(x.data()); + const float* x_max_data = + x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data(); + auto* y_data = reinterpret_cast(y.data()); + const float* y_max_data = + y_max.get_ptr() == nullptr ? nullptr : y_max.get_ptr()->data(); + auto* out_data = reinterpret_cast(ctx.template Alloc(out)); + + std::vector x_shape = phi::vectorize(x.dims()); + std::vector y_shape = phi::vectorize(y.dims()); + xpu::Activation_t act(static_cast(act_type)); + int r = + xpu::add_activation_fusion( // TX/TY/TZ/TID + /* baidu::xpu::api::Context* ctx */ ctx.x_context(), + /* const TX* x */ x_data, + /* const TY* y */ y_data, + /* TZ* z */ out_data, + /* const std::vector& x_shape */ x_shape, + /* const std::vector& y_shape */ y_shape, + /* const float* max_x */ x_max_data, + /* const float* max_y */ y_max_data, + /* float* max_z */ ctx.template Alloc(out_max), + /* const baidu::xpu::api::Activation_t& act */ act); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "add_act_xpu"); +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(add_act_xpu, + XPU, + ALL_LAYOUT, + phi::fusion::AddActXPUKernel, + float, + phi::dtype::float16) {} diff --git a/test/ir/inference/test_xpu_add_activation_fuse_pass.py b/test/ir/inference/test_xpu_add_activation_fuse_pass.py new file mode 100644 index 0000000000000..26d0333bbb161 --- /dev/null +++ b/test/ir/inference/test_xpu_add_activation_fuse_pass.py @@ -0,0 +1,77 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from functools import partial
+
+import hypothesis.strategies as st
+import numpy as np
+from auto_scan_test import PassAutoScanTest
+from program_config import OpConfig, ProgramConfig, TensorConfig
+
+
+class TestAddActXPUFusePass(PassAutoScanTest):
+    def sample_predictor_configs(self, program_config):
+        config = self.create_inference_config(use_xpu=True)
+        yield config, ["add_act_xpu"], (1e-3, 1e-3)
+
+    def sample_program_config(self, draw):
+        batch_size = draw(st.integers(min_value=1, max_value=50))
+
+        # Generate the inputs X and Y of elementwise_add
+        def generate_input():
+            return np.random.random([batch_size, 3, 100, 100]).astype(
+                np.float32
+            )
+
+        axis = -1
+
+        # Here we compose a program.
+        # There is still some risk that the program is invalid or causes a bug while running.
+        # Use the function `is_program_valid` to filter out invalid programs before running.
+        # Use the function `add_skip_pass_case` to skip programs that are known to cause bugs while running.
+        elementwise_op = OpConfig(
+            type='elementwise_add',
+            inputs={'X': ['eltwise_X'], 'Y': ['eltwise_Y']},
+            outputs={'Out': ['eltwise_output']},
+            axis=axis,
+        )
+        relu_op = OpConfig(
+            "relu",
+            inputs={"X": ["eltwise_output"]},
+            outputs={"Out": ["relu_out"]},
+        )
+        mini_graph = [elementwise_op, relu_op]
+
+        program_config = ProgramConfig(
+            ops=mini_graph,
+            weights={},
+            inputs={
+                "eltwise_X": TensorConfig(data_gen=partial(generate_input)),
+                "eltwise_Y": TensorConfig(data_gen=partial(generate_input)),
+            },
+            outputs=mini_graph[-1].outputs["Out"],
+        )
+        return program_config
+
+    def test(self):
+        self.run_and_statis(
+            quant=False,
+            max_examples=25,
+            passes=["add_activation_xpu_fuse_pass"],
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
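
For intuition, the fused add_act_xpu op produced by this pass computes a broadcasted elementwise add followed by the activation, and also emits a 6-element out_max buffer (see AddActXPUInferMeta in the patch above). Below is a minimal plain-NumPy sketch of that semantics, using the same tensor shapes as the new test; the helper name, the tanh-based gelu approximation, and the interpretation of out_max as a max-absolute-value buffer are illustrative assumptions, not a definitive description of the XPU runtime's behavior.

import numpy as np


def add_act_reference(x, y, act_type="relu"):
    """Illustrative reference for add_act_xpu: broadcasted add + activation.

    Returns (out, out_max). out_max mimics the 6-float buffer allocated by
    AddActXPUInferMeta; here it is simply filled with the max absolute value
    of the output (an assumption about what the buffer carries).
    """
    z = x + y  # NumPy broadcasting plays the role of GetBroadcastDimsArrays
    if act_type == "relu":
        out = np.maximum(z, 0.0)
    elif act_type == "gelu":
        # tanh approximation of gelu; the exact XPU formula may differ
        out = 0.5 * z * (1.0 + np.tanh(np.sqrt(2.0 / np.pi) * (z + 0.044715 * z**3)))
    else:
        raise ValueError(f"unsupported act_type: {act_type}")
    out_max = np.full([6], np.abs(out).max(), dtype=np.float32)
    return out, out_max


if __name__ == "__main__":
    # Same shapes as test_xpu_add_activation_fuse_pass.py: [batch, 3, 100, 100]
    x = np.random.random([1, 3, 100, 100]).astype(np.float32)
    y = np.random.random([1, 3, 100, 100]).astype(np.float32)
    out, out_max = add_act_reference(x, y, "relu")
    print(out.shape, out_max.shape)  # (1, 3, 100, 100) (6,)

A quick way to relate this to the pass itself: the relu/gelu branch mirrors the act_type attribute set by AddActXPUFusePass, and the out_max tensor is what link_xpu_op_max_pass later wires into the x_max/y_max inputs of a downstream fused op.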