From 26e125de993545807af63c56a307a959eab9c6df Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 19 Sep 2023 17:07:35 +0800 Subject: [PATCH 01/15] support int8 --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../framework/ir/auto_mixed_precision_pass.cc | 21 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 6 +- .../delete_weight_dequant_linear_op_pass.cc | 36 +- .../auto_trans_quantize_op_precision_pass.cc | 130 +++ .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 950 ++++++++++++++---- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 676 ++++++++++--- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 17 +- paddle/fluid/framework/ir/xpu/pass_utils.cc | 104 ++ paddle/fluid/framework/ir/xpu/pass_utils.h | 32 + paddle/fluid/framework/ir/xpu/quant_utils.cc | 146 +++ paddle/fluid/framework/ir/xpu/quant_utils.h | 28 + .../inference/analysis/passes/CMakeLists.txt | 9 +- .../passes/convert_to_mixed_precision.cc | 12 + .../inference/api/paddle_pass_builder.cc | 3 + paddle/phi/api/yaml/fused_ops.yaml | 8 +- paddle/phi/backends/xpu/xpu2_op_list.cc | 8 +- paddle/phi/infermeta/fusion.cc | 4 + paddle/phi/infermeta/fusion.h | 4 + .../kernels/fusion/xpu/conv2d_xpu_kernel.cc | 427 +++++++- .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 137 ++- 21 files changed, 2390 insertions(+), 370 deletions(-) create mode 100644 paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e67dfa5adf910..e9a8e4cc22cac 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -247,6 +247,8 @@ if(WITH_XPU) # pass_library(conv1d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_bias_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(auto_trans_quantize_op_precision_pass inference DIR xpu DEPS + ${XPU_PASS_DEPS}) pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_squeeze_unsqueeze_elimination_pass inference DIR xpu diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 698de5d90c256..497dcae8395d5 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -734,6 +734,11 @@ bool AutoMixedPrecisionPass::OutputVarsNotConvert( } void AutoMixedPrecisionPass::SetVarPrecision() const { + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL(scope, + platform::errors::PreconditionNotMet( + "During the auto_mixed_precision_pass, the scope " + "should not be null.")); for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) { @@ -750,7 +755,21 @@ void AutoMixedPrecisionPass::SetVarPrecision() const { if (!IsFP32AndFP64(real_in_var_node->Var()->GetDataType())) continue; if (!VarNodeHasDtype(real_in_var_node)) continue; if (InputVarsNotConvert(op_node, in_var_name)) continue; - + // Judge the real tensor is same to variable, Paddle-Slim weight use + // fp32 variable to save int8 tensor. 
+ if (real_in_var_node->Var()->Persistable()) { + auto* tensor = scope->Var(real_in_var_node->Name()) + ->GetMutable(); + if (framework::TransToProtoVarType(tensor->type()) != + real_in_var_node->Var()->GetDataType()) { + VLOG(1) << "[AutoMixedPrecisionPass] variable " + << real_in_var_node->Name() << "'s proto data type " + << real_in_var_node->Var()->GetDataType() + << " is different from real dense tensor " + << framework::TransToProtoVarType(tensor->type()); + continue; + } + } if (real_in_var_node->Var()->Persistable()) { real_in_var_node->Var()->SetDataType( framework::TransToProtoVarType(low_precision_)); diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index cb6a6e1d5d9dc..42c7f7acdc103 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -137,11 +137,15 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { int nums_any_ops = static_cast(dequantize_linear_op_out->outputs.size()); + int bit_length = + PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length")); for (int i = 0; i < nums_any_ops; ++i) { auto* any_op_desc = dequantize_linear_op_out->outputs[i]->Op(); any_op_desc->SetAttr("Input_scale_" + quantize_linear_op_x->Var()->Name(), input_scale); - + any_op_desc->SetAttr( + "Input_bit_length_" + quantize_linear_op_x->Var()->Name(), + bit_length); // link x to any_op2 any_op_desc->RenameInput(dequantize_linear_op_out->Var()->Name(), quantize_linear_op_x->Var()->Name()); diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 0b09d1b30f40a..0140fb664b1de 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -35,7 +35,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { true, platform::errors::InvalidArgument( "Graph must have kParamScopeAttr attribute.")); - + VLOG(1) << "Handle delete weight dequant linear op pass ..."; auto& scope = graph->Get(kParamScopeAttr); bool is_int8 = false; @@ -44,7 +44,9 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); + VLOG(1) << "Dequantize linear op Type: " << op->Type(); if (op->Type() == "dequantize_linear") { + VLOG(1) << "Dequantize linear op is come in: " << op->Type(); Node *weight_var_node, *calcu_op_node, *while_op_node; Node *dequantized_weight_var_node = nullptr, *scale_var_node = nullptr; // 1. Judge whether for dequant weight and find @@ -110,6 +112,8 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_tensor->dtype())); } + int bit_length = + PADDLE_GET_CONST(int, op->GetAttr("bit_length")); int quant_axis = PADDLE_GET_CONST(int, op->GetAttr("quant_axis")); if (quant_axis == -1) { // per_layer quant_dequant: all OP @@ -124,14 +128,36 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { calcu_op_desc->SetAttr("weight_scale", weight_scale[0]); } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Delete Weight Dequant Linear Op Pass is not supported " - "for " - "per-channel quantization")); + std::vector weights_shape = + weight_var_node->Var()->GetShape(); + quant_axis = quant_axis >= 0 + ? 
quant_axis + : quant_axis + weights_shape.size(); + PADDLE_ENFORCE_EQ( + weight_scale_nums, + weights_shape[quant_axis], + platform::errors::InvalidArgument( + "When quant_axis != -1, it means using per_channel " + "dequantization. In this situation, the number of " + "weight_scale should be equal with " + "weights_shape[quant_axis=%d]=%ld , but received " + "%d.", + quant_axis, + weights_shape[quant_axis], + weight_scale_nums)); + calcu_op_desc->SetAttr("weight_scale", weight_scale); } + calcu_op_desc->SetAttr("weight_quant_axis", quant_axis); + calcu_op_desc->SetAttr("weight_bit_length", bit_length); + calcu_op_desc->SetAttr("enable_int8", true); + VLOG(1) << "dequantized_weight_var_node->Var()->Name():" + << dequantized_weight_var_node->Var()->Name(); + VLOG(1) << "weight_var_node->Var()->Name(): " + << weight_var_node->Var()->Name(); calcu_op_desc->RenameInput( dequantized_weight_var_node->Var()->Name(), weight_var_node->Var()->Name()); + calcu_op_desc->Flush(); } } } diff --git a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc new file mode 100644 index 0000000000000..c8b4b7c040f7e --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include 
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class AutoTransQuantizeOpPrecisionPass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+  void FirstRound(ir::Graph* graph) const;
+
+  const std::string name_scope_{"auto_trans_quantize_op_precision_pass"};
+  const std::unordered_set<std::string> support_fusion_quant_op_type_{
+      "conv2d_xpu"};
+};
+
+static inline Node* GetOpOutVarNodeByArgsName(ir::Graph* graph,
+                                              Node* op_node,
+                                              const std::string& arg_name) {
+  CHECK_EQ(op_node->IsOp(), true);
+  auto* op_desc = op_node->Op();
+  auto out_var_nodes = op_desc->Output(arg_name);
+  CHECK_EQ(out_var_nodes.size(), 1UL);
+  auto out_var_name = out_var_nodes[0];
+  auto out_var_node = FindNodeWithName(graph, out_var_name);
+  return out_var_node;
+}
+
+void AutoTransQuantizeOpPrecisionPass::FirstRound(ir::Graph* graph) const {
+  auto graph_size = graph->SubGraphsSize();
+  VLOG(1) << "There are " << graph_size << " subgraphs to be handled.";
+  for (size_t i = 0; i < graph_size; i++) {
+    auto subgraph = graph->GetSubGraph(i);
+    VLOG(1) << "Handling the subgraph id: " << i;
+    for (auto* op_node : TopologySortOperations(*subgraph)) {
+      auto op_type = op_node->Op()->Type();
+      if (support_fusion_quant_op_type_.find(op_type) !=
+          support_fusion_quant_op_type_.end()) {
+        bool enable_int8 = op_node->Op()->GetAttrIfExists<bool>("enable_int8");
+        int out_dtype = op_node->Op()->GetAttrIfExists<int>("out_dtype");
+        if (enable_int8) {
+          if (op_type == "conv2d_xpu") {
+            auto* out_var_node =
+                GetOpOutVarNodeByArgsName(subgraph, op_node, "out");
+            PADDLE_ENFORCE_NOT_NULL(
+                out_var_node,
+                platform::errors::InvalidArgument(
+                    "out_var_node in graph cannot be nullptr."));
+            bool is_int8_out = true;
+            for (auto* next_op_node : out_var_node->outputs) {
+              auto next_op_type = next_op_node->Op()->Type();
+              bool is_next_op_support_int8 =
+                  next_op_node->Op()->GetAttrIfExists<bool>("enable_int8") &&
+                  ((support_fusion_quant_op_type_.find(next_op_type) !=
+                    support_fusion_quant_op_type_.end()));
+              if (!is_next_op_support_int8) {
+                is_int8_out = false;
+                break;
+              }
+            }
+            if (is_int8_out) {
+              op_node->Op()->SetAttr(
+                  "out_dtype",
+                  static_cast<int>(proto::VarType::Type::VarType_Type_INT8));
+              out_var_node->Var()->SetDataType(
+                  proto::VarType::Type::VarType_Type_INT8);
+              VLOG(1) << "The out var node " << out_var_node->Name()
+                      << " is INT8";
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void AutoTransQuantizeOpPrecisionPass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+  VLOG(1) << "AutoTransQuantizeOpPrecisionPass handling start ...";
+  FirstRound(graph);
+  VLOG(1) << "AutoTransQuantizeOpPrecisionPass handling end.";
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(auto_trans_quantize_op_precision_pass,
+              paddle::framework::ir::AutoTransQuantizeOpPrecisionPass);
+
+REGISTER_PASS_CAPABILITY(auto_trans_quantize_op_precision_pass)
+    
.AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc_xpu", 0) + .EQ("conv2d_xpu", 0)); diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 502c275a419d3..19e006d535409 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include "glog/logging.h" @@ -355,6 +356,49 @@ class Conv2dXPUFusePass : public FusePassBase { bool with_branch_x, bool with_branch_y) const; + Node* GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const; + + void CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_conv_bias, + bool with_bn, + bool with_scale, + bool enable_int8) const; + + void CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + void CreateFusionBranch( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + void CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8, + std::string act_type) const; + + const std::unordered_set support_quant_op_type_{"conv2d", + "conv2d_xpu"}; const std::string name_scope_{"conv2d_xpu_fuse_pass"}; }; @@ -401,6 +445,535 @@ void Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_subgraph_count); } +Node* Conv2dXPUFusePass::GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const { + auto iter = nodes_map.find(pattern_node_name); + PADDLE_ENFORCE_EQ( + iter != nodes_map.end(), + true, + platform::errors::InvalidArgument("nodes_map[%s] not found in nodes_map", + pattern_node_name.c_str())); + auto node_map = iter->second; + auto node_iter = node_map.find(node_name); + PADDLE_ENFORCE_EQ(node_iter != node_map.end(), + true, + platform::errors::InvalidArgument( + "nodes_map[%s][%s] not found in nodes_map", + pattern_node_name.c_str(), + node_name.c_str())); + return node_iter->second; +} + +void Conv2dXPUFusePass::CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_conv_bias, + bool with_bn, + bool with_scale, + bool enable_int8) const { + // Get Node + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + auto* conv_filter = GetNodeFromNodesMap(nodes_map, "conv", "conv_filter"); + PADDLE_ENFORCE_EQ(conv_filter != nullptr, + true, + platform::errors::InvalidArgument( + "conv_filter node ptr can not be null")); + + // transfilter fp16 --> fp32 + auto* filter_t = + scope->FindVar(conv_filter->Name())->GetMutable(); + auto filter_len = filter_t->numel(); + auto filter_dtype = filter_t->dtype(); + if (filter_dtype == phi::DataType::FLOAT16) { + CastToFp32(filter_t, nullptr); + } + + // Get Weight scale in int8 scene + std::vector weight_scale = + 
conv->Op()->GetAttrIfExists>("weight_scale"); + // Create fusion_bias_node + auto filter_dims = filter_t->dims(); + bool has_bias = with_bn || with_conv_bias; + Node* fusion_bias_node = nullptr; + if (with_conv_bias) { + auto* ew_bias_add_y = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_y"); + PADDLE_ENFORCE_EQ(ew_bias_add_y != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_y node ptr can not be null")); + auto* ew_bias_add_y_t = + scope->FindVar(ew_bias_add_y->Name())->GetMutable(); + auto ew_bias_add_y_dims = ew_bias_add_y_t->dims(); + PADDLE_ENFORCE_EQ(filter_dims[0], + ew_bias_add_y_dims[0], + platform::errors::InvalidArgument( + "the shape[%d] of elewise bias tensor " + "must equal out_channel[%d] of conv", + ew_bias_add_y_dims[0], + filter_dims[0])); + PrepareBias(graph, scope, block, ew_bias_add_y, &fusion_bias_node); + } + + if (with_bn) { + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + PADDLE_ENFORCE_EQ( + bn != nullptr, + true, + platform::errors::InvalidArgument("bn node ptr can not be null")); + auto* bn_bias = GetNodeFromNodesMap(nodes_map, "bn", "bn_bias"); + PADDLE_ENFORCE_EQ( + bn_bias != nullptr, + true, + platform::errors::InvalidArgument("bn_bias node ptr can not be null")); + auto* bn_scale = GetNodeFromNodesMap(nodes_map, "bn", "bn_scale"); + PADDLE_ENFORCE_EQ( + bn_scale != nullptr, + true, + platform::errors::InvalidArgument("bn_scale node ptr can not be null")); + auto* bn_var = GetNodeFromNodesMap(nodes_map, "bn", "bn_var"); + PADDLE_ENFORCE_EQ( + bn_var != nullptr, + true, + platform::errors::InvalidArgument("bn_var node ptr can not be null")); + auto* bn_mean = GetNodeFromNodesMap(nodes_map, "bn", "bn_mean"); + PADDLE_ENFORCE_EQ( + bn_mean != nullptr, + true, + platform::errors::InvalidArgument("bn_mean node ptr can not be null")); + + auto bn_bias_t = + scope->Var(bn_bias->Name())->GetMutable(); + PADDLE_ENFORCE_EQ( + filter_dims[0], + bn_bias_t->dims()[0], + platform::errors::InvalidArgument("the shape[%d] of bn bias tensor " + "must equal out_channel[%d] of conv", + bn_bias_t->dims()[0], + filter_dims[0])); + auto bn_scale_t = + scope->Var(bn_scale->Name())->GetMutable(); + auto bn_mean_t = + scope->Var(bn_mean->Name())->GetMutable(); + auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); + float* bn_scale_ptr = + bn_scale_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_bias_ptr = + bn_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_mean_ptr = + bn_mean_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_var_ptr = + bn_var_t->mutable_data(paddle::platform::CPUPlace()); + auto mean_len = bn_mean_t->numel(); + auto filter_stride = filter_len / mean_len; + float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); + if (!with_conv_bias) { // prev node is conv + PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); + } + + auto fusion_bias_t = + scope->Var(fusion_bias_node->Name())->GetMutable(); + float* fusion_bias_ptr = + fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + // recompute bias and weights + for (int i = 0; i < mean_len; ++i) { + bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); + } + // recompute the weights + if (!enable_int8) { + float* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; j++) { + filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; + } + } + } else { + int8_t* filter_ptr = + 
filter_t->mutable_data<int8_t>(paddle::platform::CPUPlace());
+      PADDLE_ENFORCE_EQ(
+          weight_scale.size(),
+          mean_len,
+          platform::errors::InvalidArgument(
+              "Weight max_scale size must equal batch_norm scale/mean size."));
+      for (int i = 0; i < mean_len; i++) {
+        weight_scale[i] *= fabs(bn_scale_ptr[i]);
+      }
+      for (int i = 0; i < mean_len; i++) {
+        if (bn_scale_ptr[i] < 0) {
+          for (int j = 0; j < filter_stride; ++j) {
+            filter_ptr[i * filter_stride + j] *= -1;
+          }
+        }
+      }
+    }
+    // recompute bias
+    if (!with_conv_bias) {
+      for (int i = 0; i < mean_len; ++i) {
+        fusion_bias_ptr[i] += (0.0f - bn_mean_ptr[i]) * bn_scale_ptr[i];
+      }
+    } else {
+      for (int i = 0; i < mean_len; ++i) {
+        fusion_bias_ptr[i] =
+            bn_bias_ptr[i] +
+            (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i];
+      }
+    }
+  }
+
+  // deal with scale op
+  if (with_scale && !enable_int8) {
+    auto* scale = GetNodeFromNodesMap(nodes_map, "scale", "scale");
+    PADDLE_ENFORCE_EQ(
+        scale != nullptr,
+        true,
+        platform::errors::InvalidArgument("scale node ptr can not be null"));
+    auto bias_len = filter_dims[0];
+    float scale_val_ = 1.f;
+    float bias_val_ = 0.f;
+    scale_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("scale"));
+    bias_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("bias"));
+    bool bias_after_scale_ =
+        PADDLE_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale"));
+    // recompute bias as scale op
+    auto fusion_bias_t = scope->GetVar(fusion_bias_node->Name())
+                             ->GetMutable<phi::DenseTensor>();
+    float* fusion_bias_ptr =
+        fusion_bias_t->mutable_data<float>(paddle::platform::CPUPlace());
+    for (int i = 0; i < bias_len; ++i) {
+      if (bias_after_scale_) {
+        fusion_bias_ptr[i] = fusion_bias_ptr[i] * scale_val_ + bias_val_;
+      } else {
+        fusion_bias_ptr[i] = (fusion_bias_ptr[i] + bias_val_) * scale_val_;
+      }
+    }
+    // recompute weight as scale op
+    float* filter_ptr =
+        filter_t->mutable_data<float>(paddle::platform::CPUPlace());
+    for (int i = 0; i < filter_len; ++i) {
+      filter_ptr[i] *= scale_val_;
+    }
+  }
+
+  (*fusion_nodes_map)["bias"] = fusion_bias_node;
+
+  Node* filter_intx = nullptr;
+  Node* filter_max = nullptr;
+  Node* scale_max = nullptr;
+  if (!enable_int8) {
+    PrepareWeight<int16_t>(graph,
+                           scope,
+                           block,
+                           conv_filter,
+                           &filter_intx,
+                           &filter_max,
+                           false,
+                           weight_scale);
+  } else {
+    PrepareWeight<int8_t>(graph,
+                          scope,
+                          block,
+                          conv_filter,
+                          &filter_intx,
+                          &filter_max,
+                          false,
+                          weight_scale);
+  }
+
+  bool is_per_channel_need_create_scale_max_node =
+      !weight_scale.empty() && !IsPerTensorQuant(weight_scale);
+  if (is_per_channel_need_create_scale_max_node) {
+    phi::DenseTensor ones_weight_max_tensor;
+    auto* cpu_ctx = static_cast<phi::CPUContext*>(
+        platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+    int max_ptr_size = weight_scale.empty()
+                           ?
phi::backends::xpu::get_xpu_max_ptr_size(-1) + : weight_scale.size(); + ones_weight_max_tensor.set_type(phi::DataType::FLOAT32); + ones_weight_max_tensor.Resize({max_ptr_size}); + std::vector ones_weight(max_ptr_size, 1.0); + memcpy(cpu_ctx->Alloc(&ones_weight_max_tensor), + ones_weight.data(), + max_ptr_size * sizeof(float)); + + std::string scale_max_name = conv_filter->Name() + "_scale_max"; + VarDesc scale_max_desc(scale_max_name); + scale_max_desc.SetPersistable(true); + scale_max_desc.SetShape(vectorize(ones_weight_max_tensor.dims())); + scale_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + scale_max = graph->CreateVarNode(&scale_max_desc); + auto* block_scale_max_desc = block->Var(scale_max_name); + block_scale_max_desc->SetPersistable(scale_max_desc.Persistable()); + block_scale_max_desc->SetShape(scale_max_desc.GetShape()); + block_scale_max_desc->SetDataType(scale_max_desc.GetDataType()); + Assign(ones_weight_max_tensor, + scope->Var(scale_max_name)->GetMutable()); + } + + (*fusion_nodes_map)["filter"] = filter_intx; + if (is_per_channel_need_create_scale_max_node) { + (*fusion_nodes_map)["filter_max"] = scale_max; + (*fusion_nodes_map)["scale_max"] = filter_max; + } else { + (*fusion_nodes_map)["filter_max"] = filter_max; + (*fusion_nodes_map)["scale_max"] = scale_max; + } +} + +void Conv2dXPUFusePass::CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + // Get Node + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + auto* input = GetNodeFromNodesMap(nodes_map, "conv", "input"); + PADDLE_ENFORCE_EQ( + input != nullptr, + true, + platform::errors::InvalidArgument("conv input node ptr can not be null")); + // input max + std::string conv_input_max_name = input->Name() + "_input_max"; + Node* conv2d_xpu_input_max = nullptr; + if (enable_int8) { + float input_scale = + conv->Op()->GetAttrIfExists("Input_scale_" + input->Name()); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc conv_input_max_desc(conv_input_max_name); + conv_input_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + conv_input_max_desc.SetShape({static_cast(max_ptr_size)}); + conv_input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + conv2d_xpu_input_max = graph->CreateVarNode(&conv_input_max_desc); + auto input_max_tensor = + scope->Var(conv_input_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, input_scale); + memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + } + (*fusion_nodes_map)["x"] = input; + (*fusion_nodes_map)["x_max"] = conv2d_xpu_input_max; +} + +void Conv2dXPUFusePass::CreateFusionBranch( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + // Get Node + auto* ew_branch_add = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); + if (ew_branch_add) { + auto* ew_branch_add_in = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add_in"); + PADDLE_ENFORCE_EQ(ew_branch_add_in 
!= nullptr, + true, + platform::errors::InvalidArgument( + "ew_branch_add_in node ptr can not be null")); + (*fusion_nodes_map)["branch"] = ew_branch_add_in; + // ew_branch_add_max + std::string ew_branch_add_max_name = + ew_branch_add_in->Name() + "branch_max"; + Node* ew_branch_add_max = FindNodeWithName(graph, ew_branch_add_max_name); + if (enable_int8 && !ew_branch_add_max) { + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc ew_branch_add_in_max_desc(ew_branch_add_max_name); + ew_branch_add_in_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to + // xpu device + ew_branch_add_in_max_desc.SetShape({static_cast(max_ptr_size)}); + ew_branch_add_in_max_desc.SetDataType( + proto::VarType::Type::VarType_Type_FP32); + ew_branch_add_max = graph->CreateVarNode(&ew_branch_add_in_max_desc); + float ew_branch_add_scale = ew_branch_add->Op()->GetAttrIfExists( + "Input_scale_" + ew_branch_add_in->Name()); + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + conv->Op()->SetAttr("Input_scale_" + ew_branch_add_in->Name(), + ew_branch_add_scale); + auto ew_branch_add_max_tensor = + scope->Var(ew_branch_add_max_name)->GetMutable(); + ew_branch_add_max_tensor->set_type(phi::DataType::FLOAT32); + ew_branch_add_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector ew_branch_add_scales(max_ptr_size, + ew_branch_add_scale); + memcpy(cpu_ctx->Alloc(ew_branch_add_max_tensor), + ew_branch_add_scales.data(), + max_ptr_size * sizeof(float)); + } + (*fusion_nodes_map)["branch_max"] = ew_branch_add_max; + } +} + +void Conv2dXPUFusePass::CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8, + std::string act_type) const { + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + // output && output max + std::string conv2d_xpu_out_name; + Node* conv2d_out_op_node = nullptr; + Node* conv2d_out_var_node = nullptr; + + auto* ew_branch_add = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + auto* ew_bias_add = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add"); + if (!act_type.empty()) { + auto* act_out = GetNodeFromNodesMap(nodes_map, "act", "act_out"); + PADDLE_ENFORCE_EQ( + act_out != nullptr, + true, + platform::errors::InvalidArgument("act_out node ptr can not be null")); + conv2d_xpu_out_name = act_out->Name(); + conv2d_out_var_node = act_out; + auto* act = GetNodeFromNodesMap(nodes_map, "act", "act"); + PADDLE_ENFORCE_EQ( + act != nullptr, + true, + platform::errors::InvalidArgument("act node ptr can not be null")); + conv2d_out_op_node = act; + } else if (ew_branch_add) { + auto* ew_branch_add_out = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add_out"); + PADDLE_ENFORCE_EQ(ew_branch_add_out != nullptr, + true, + platform::errors::InvalidArgument( + "ew_branch_add_out node ptr can not be null")); + conv2d_xpu_out_name = ew_branch_add_out->Name(); + conv2d_out_var_node = ew_branch_add_out; + PADDLE_ENFORCE_EQ(ew_branch_add != nullptr, + true, + platform::errors::InvalidArgument( + "ew_branch_add 
node ptr can not be null")); + conv2d_out_op_node = ew_branch_add; + } else if (bn) { + auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); + PADDLE_ENFORCE_EQ( + bn_out != nullptr, + true, + platform::errors::InvalidArgument("bn_out node ptr can not be null")); + conv2d_xpu_out_name = bn_out->Name(); + conv2d_out_var_node = bn_out; + conv2d_out_op_node = bn; + } else if (ew_bias_add) { + auto* ew_bias_add_out = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); + PADDLE_ENFORCE_EQ(ew_bias_add_out != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_out node ptr can not be null")); + conv2d_xpu_out_name = ew_bias_add_out->Name(); + conv2d_out_var_node = ew_bias_add_out; + conv2d_out_op_node = ew_bias_add; + } else { + auto* conv_out = GetNodeFromNodesMap(nodes_map, "conv", "conv_out"); + PADDLE_ENFORCE_EQ( + conv_out != nullptr, + true, + platform::errors::InvalidArgument("conv_out node ptr can not be null")); + conv2d_xpu_out_name = conv_out->Name(); + conv2d_out_var_node = conv_out; + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + conv2d_out_op_node = conv; + } + (*fusion_nodes_map)["out"] = conv2d_out_var_node; + + // Create out max in + if (enable_int8) { + std::string conv_out_max_in_name = conv2d_xpu_out_name + "_max_in"; + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc conv_out_max_in_desc(conv_out_max_in_name); + conv_out_max_in_desc.SetPersistable(true); + conv_out_max_in_desc.SetShape({static_cast(max_ptr_size)}); + conv_out_max_in_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* conv2d_xpu_out_max_in = graph->CreateVarNode(&conv_out_max_in_desc); + auto* block_out_max_in_desc = block->Var(conv_out_max_in_name); + block_out_max_in_desc->SetPersistable(conv_out_max_in_desc.Persistable()); + block_out_max_in_desc->SetShape(conv_out_max_in_desc.GetShape()); + block_out_max_in_desc->SetDataType(conv_out_max_in_desc.GetDataType()); + + auto GetOutputScale = [&](Node* var_node, std::string name) -> float { + int nums_any_ops = var_node->outputs.size(); + for (size_t i = 0; i < nums_any_ops; ++i) { + auto* any_op_desc = conv2d_out_var_node->outputs[i]->Op(); + VLOG(1) << "any_op_desc: " << any_op_desc->Type(); + if (any_op_desc->HasAttr("Input_scale_" + name)) { + VLOG(1) << "find it: " + << "Input_scale_" + name; + return any_op_desc->GetAttrIfExists("Input_scale_" + name); + } + } + return 0; + }; + float output_scale = + GetOutputScale(conv2d_out_var_node, conv2d_xpu_out_name); + conv->Op()->SetAttr("Input_scale_" + conv2d_xpu_out_name, output_scale); + VLOG(1) << "conv2d_xpu_out_name:" << conv2d_xpu_out_name + << " output_scale: " << output_scale + << "conv2d_out_var_node name:" << conv2d_out_var_node->Name(); + phi::DenseTensor out_max_in_cpu_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + out_max_in_cpu_tensor.set_type(phi::DataType::FLOAT32); + out_max_in_cpu_tensor.Resize({max_ptr_size}); + std::vector output_scales(max_ptr_size, output_scale); + memcpy(cpu_ctx->Alloc(&out_max_in_cpu_tensor), + output_scales.data(), + max_ptr_size * sizeof(float)); + Assign(out_max_in_cpu_tensor, + scope->Var(conv_out_max_in_name)->GetMutable()); + (*fusion_nodes_map)["out_max_in"] = conv2d_xpu_out_max_in; + } + + // Create out max + std::string conv_out_max_name = conv2d_xpu_out_name + "_max"; + VarDesc 
conv_out_max_desc(conv_out_max_name); + Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv_out_max_desc); + (*fusion_nodes_map)["out_max"] = conv2d_xpu_out_max; +} + int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, const std::string& conv_type, const std::string& act_type, @@ -419,18 +992,22 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, with_scale, with_branch_x, with_branch_y); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); + int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle Conv2dXPUFusePass fuse"; - /* declare operator node's name */ + std::map> nodes_map; GET_IR_NODE(conv); GET_IR_NODE(ew_bias_add); GET_IR_NODE(bn); GET_IR_NODE(scale); GET_IR_NODE(ew_branch_add); GET_IR_NODE(act); - /* declare variable node's name*/ + /* Get variable node's name*/ GET_IR_NODE(input); GET_IR_NODE(conv_filter); GET_IR_NODE(conv_out); @@ -449,167 +1026,140 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(ew_branch_add_in); GET_IR_NODE(ew_branch_add_out); GET_IR_NODE(act_out); + + nodes_map.insert({"conv", + {{"conv", conv}, + {"conv_filter", conv_filter}, + {"input", input}, + {"conv_out", conv_out}}}); + nodes_map.insert({"ew_bias_add", + {{"ew_bias_add", ew_bias_add}, + {"ew_bias_add_y", ew_bias_add_y}, + {"ew_bias_add_out", ew_bias_add_out}}}); + nodes_map.insert({"bn", + {{"bn", bn}, + {"bn_bias", bn_bias}, + {"bn_mean", bn_mean}, + {"bn_scale", bn_scale}, + {"bn_var", bn_var}, + {"bn_out", bn_out}, + {"bn_var_out", bn_var_out}, + {"bn_mean_out", bn_mean_out}, + {"bn_saved_var", bn_saved_var}, + {"bn_saved_mean", bn_saved_mean}}}); + nodes_map.insert({"scale", {{"scale", scale}, {"scale_out", scale_out}}}); + nodes_map.insert({"ew_branch_add", + {{"ew_branch_add", ew_branch_add}, + {"ew_branch_add_in", ew_branch_add_in}, + {"ew_branch_add_out", ew_branch_add_out}}}); + nodes_map.insert({"act", {{"act", act}, {"act_out", act_out}}}); + + std::map fusion_nodes_map{{"x", nullptr}, + {"x_max", nullptr}, + {"filter", nullptr}, + {"filter_max", nullptr}, + {"bias", nullptr}, + {"branch", nullptr}, + {"branch_max", nullptr}, + {"scale_max", nullptr}, + {"out_max_in", nullptr}, + {"out", nullptr}, + {"out_max", nullptr}}; + + bool enable_int8 = conv->Op()->GetAttrIfExists("enable_int8"); + std::string op_precision_str = enable_int8 ? 
"int8" : "fp32"; + VLOG(4) << "Conv2d fusion fuse pass is running on " << op_precision_str + << " precision!"; auto* block = conv->Op()->Block(); - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - // recompute bias and weight for conv2d_xpu op - auto* filter_t = - scope->FindVar(conv_filter->Name())->GetMutable(); - // conv_filter fp16 --> fp32 - auto filter_len = filter_t->numel(); - auto filter_dtype = filter_t->dtype(); - int out_dtype = proto::VarType::Type::VarType_Type_FP32; - if (filter_dtype == phi::DataType::FLOAT16) { - out_dtype = proto::VarType::Type::VarType_Type_FP16; - CastToFp32(filter_t, nullptr); - } - - auto filter_dims = filter_t->dims(); - bool has_bias = with_bn || with_conv_bias; - // Create conv_fusion_bias (conv bias) variable - Node* fusion_bias_node = nullptr; - if (has_bias) { - if (with_conv_bias) { - auto* ew_bias_add_y_t = scope->FindVar(ew_bias_add_y->Name()) - ->GetMutable(); - auto ew_bias_add_y_dims = ew_bias_add_y_t->dims(); - PADDLE_ENFORCE_EQ(filter_dims[0], - ew_bias_add_y_dims[0], - platform::errors::InvalidArgument( - "the shape[%d] of elewise bias tensor " - "must equal out_channel[%d] of conv", - ew_bias_add_y_dims[0], - filter_dims[0])); - PrepareBias(graph, scope, block, ew_bias_add_y, &fusion_bias_node); - } - if (with_bn) { - auto bn_bias_t = - scope->Var(bn_bias->Name())->GetMutable(); - PADDLE_ENFORCE_EQ(filter_dims[0], - bn_bias_t->dims()[0], - platform::errors::InvalidArgument( - "the shape[%d] of bn bias tensor " - "must equal out_channel[%d] of conv", - bn_bias_t->dims()[0], - filter_dims[0])); - auto bn_scale_t = - scope->Var(bn_scale->Name())->GetMutable(); - auto bn_mean_t = - scope->Var(bn_mean->Name())->GetMutable(); - auto bn_var_t = - scope->Var(bn_var->Name())->GetMutable(); - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_scale_ptr = - bn_scale_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_bias_ptr = - bn_bias_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_mean_ptr = - bn_mean_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_var_ptr = - bn_var_t->mutable_data(paddle::platform::CPUPlace()); - auto mean_len = bn_mean_t->numel(); - auto filter_stride = filter_len / mean_len; - float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); - if (!with_conv_bias) { // prev node is conv - PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); - } - auto fusion_bias_t = scope->Var(fusion_bias_node->Name()) - ->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); - // recompute bias and weights - if (!with_conv_bias) { // prev node is conv - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - fusion_bias_ptr[i] += (0.0f - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_stride; j++) { - filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; - } - } - } else { - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - fusion_bias_ptr[i] = - bn_bias_ptr[i] + - (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_stride; j++) { - filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; - } - } - } - } - } - // deal with scale op - if (with_scale) { - auto bias_len = filter_dims[0]; - float scale_val_ = 1.f; - float bias_val_ = 0.f; - scale_val_ = PADDLE_GET_CONST(float, 
scale->Op()->GetAttr("scale")); - bias_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("bias")); - bool bias_after_scale_ = - PADDLE_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale")); - // recompute bias as scale op - auto fusion_bias_t = scope->GetVar(fusion_bias_node->Name()) - ->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); - for (int i = 0; i < bias_len; ++i) { - if (bias_after_scale_) { - fusion_bias_ptr[i] = fusion_bias_ptr[i] * scale_val_ + bias_val_; - } else { - fusion_bias_ptr[i] = (fusion_bias_ptr[i] + bias_val_) * scale_val_; - } - } - // recompute weight as scale op - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); - for (int i = 0; i < filter_len; ++i) { - filter_ptr[i] *= scale_val_; - } - } - // filter max - Node* filter_int16 = nullptr; - Node* filter_max = nullptr; - PrepareWeight( - graph, scope, block, conv_filter, &filter_int16, &filter_max, false); - // output && output max - std::string conv2d_xpu_out_name; - if (!act_type.empty()) { - conv2d_xpu_out_name = act_out->Name(); - } else if (ew_branch_add) { - conv2d_xpu_out_name = ew_branch_add_out->Name(); - } else if (scale) { - conv2d_xpu_out_name = scale_out->Name(); - } else if (bn) { - conv2d_xpu_out_name = bn_out->Name(); - } else if (ew_bias_add) { - conv2d_xpu_out_name = ew_bias_add_out->Name(); - } else { - conv2d_xpu_out_name = conv_out->Name(); - } - std::string conv2d_xpu_out_max_name = conv2d_xpu_out_name + "_max"; - VarDesc conv2d_xpu_out_max_desc(conv2d_xpu_out_max_name); - Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv2d_xpu_out_max_desc); + CreateFusionWeightsAndBias(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + with_conv_bias, + with_bn, + with_scale, + enable_int8); + VLOG(1) << "CreateFusionWeightsAndBias success!"; + CreateFusionInputs( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + VLOG(1) << "CreateFusionInputs success!"; + CreateFusionBranch( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + VLOG(1) << "CreateFusionBranch success!"; + CreateFusionOutputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + enable_int8, + act_type); + VLOG(1) << "CreateFusionOutputs success!"; + // int out_dtype = PADDLE_GET_CONST(int, conv->Op()->GetAttr("out_dtype")); + // if (out_dtype == proto::VarType::Type::VarType_Type_INT8) { + // fusion_nodes_map["out"]->Var()->SetDataType( + // proto::VarType::Type::VarType_Type_INT8); + // if (fusion_nodes_map["branch"]) { + // fusion_nodes_map["branch"]->Var()->SetDataType( + // proto::VarType::Type::VarType_Type_INT8); + // } + // } // Generate conv2d_xpu op framework::OpDesc conv2d_xpu_op_desc(block); + for (auto [first, second] : fusion_nodes_map) { + VLOG(1) << "first: " << first << " second: " << second; + if (first == "x" || first == "out" || first == "out_max" || + first == "branch") + continue; + if (second != nullptr) { + auto* temp_tensor = + scope->FindVar(second->Name())->GetMutable(); + VLOG(1) << *temp_tensor; + } + } // set input&output var conv2d_xpu_op_desc.SetType("conv2d_xpu"); - conv2d_xpu_op_desc.SetInput("x", {input->Name()}); - conv2d_xpu_op_desc.SetInput("filter", {filter_int16->Name()}); - conv2d_xpu_op_desc.SetInput("filter_max", {filter_max->Name()}); - conv2d_xpu_op_desc.SetOutput("out", {conv2d_xpu_out_name}); - conv2d_xpu_op_desc.SetOutput("out_max", {conv2d_xpu_out_max_name}); - // set fusion_bias input node - if (has_bias) { - conv2d_xpu_op_desc.SetInput("bias", 
{fusion_bias_node->Name()}); + conv2d_xpu_op_desc.SetInput("x", {fusion_nodes_map["x"]->Name()}); + if (fusion_nodes_map["x_max"]) { + conv2d_xpu_op_desc.SetInput("x_max", {fusion_nodes_map["x_max"]->Name()}); + } + conv2d_xpu_op_desc.SetInput("filter", {fusion_nodes_map["filter"]->Name()}); + conv2d_xpu_op_desc.SetInput("filter_max", + {fusion_nodes_map["filter_max"]->Name()}); + if (fusion_nodes_map["scale_max"]) { + conv2d_xpu_op_desc.SetInput("scale_max", + {fusion_nodes_map["scale_max"]->Name()}); + } + if (fusion_nodes_map["out_max_in"]) { + conv2d_xpu_op_desc.SetInput("out_max_in", + {fusion_nodes_map["out_max_in"]->Name()}); + } + conv2d_xpu_op_desc.SetOutput("out", {fusion_nodes_map["out"]->Name()}); + conv2d_xpu_op_desc.SetOutput("out_max", + {fusion_nodes_map["out_max"]->Name()}); + if (with_conv_bias || with_bn) { + PADDLE_ENFORCE_EQ( + fusion_nodes_map["bias"] != nullptr, + true, + platform::errors::InvalidArgument( + "fusion_nodes_map['bias'] node ptr can not be null")); + conv2d_xpu_op_desc.SetInput("bias", {fusion_nodes_map["bias"]->Name()}); } // set ew_branch_add input node if (ew_branch_add != nullptr) { - conv2d_xpu_op_desc.SetInput("branch", {ew_branch_add_in->Name()}); + PADDLE_ENFORCE_EQ( + fusion_nodes_map["branch"] != nullptr, + true, + platform::errors::InvalidArgument( + "fusion_nodes_map['branch'] node ptr can not be null")); + conv2d_xpu_op_desc.SetInput("branch", + {fusion_nodes_map["branch"]->Name()}); + if (fusion_nodes_map["branch_max"]) { + conv2d_xpu_op_desc.SetInput("branch_max", + {fusion_nodes_map["branch_max"]->Name()}); + } } + VLOG(1) << "creat conv2d_xpu_op_desc success!"; // set attrs of conv2d_xpu float act_param_ = 0.0f; if (!act_type.empty()) { @@ -646,57 +1196,73 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, "strides", PADDLE_GET_CONST(std::vector, conv->Op()->GetAttr("strides"))); conv2d_xpu_op_desc.SetAttr("paddings", conv_paddings); - conv2d_xpu_op_desc.SetAttr("out_dtype", out_dtype); + // out_dtype is same to input precision + conv2d_xpu_op_desc.SetAttr("out_dtype", + fusion_nodes_map["x"]->Var()->GetDataType()); + conv2d_xpu_op_desc.SetAttr( + "enable_int8", conv->Op()->GetAttrIfExists("enable_int8")); + if (enable_int8) { + conv2d_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["out"]->Name(), + conv->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["out"]->Name())); + conv2d_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["x"]->Name(), + conv->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["x"]->Name())); + if (fusion_nodes_map["branch"]) { + conv2d_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["branch"]->Name(), + conv->Op()->GetAttrIfExists( + "Input_scale_" + fusion_nodes_map["branch"]->Name())); + } + } + VLOG(1) << "Set attr success!"; + // Link node auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); - IR_NODE_LINK_TO(input, conv2d_xpu); - IR_NODE_LINK_TO(filter_int16, conv2d_xpu); - IR_NODE_LINK_TO(filter_max, conv2d_xpu); - if (ew_bias_add || bn) { - SAFE_IR_NODE_LINK_TO(fusion_bias_node, conv2d_xpu); - } - if (ew_branch_add_in) { - IR_NODE_LINK_TO(ew_branch_add_in, conv2d_xpu); - } - if (act_out) { - IR_NODE_LINK_TO(conv2d_xpu, act_out); - } else if (ew_branch_add_out) { - IR_NODE_LINK_TO(conv2d_xpu, ew_branch_add_out); - } else if (scale_out) { - IR_NODE_LINK_TO(conv2d_xpu, scale_out); - } else if (bn_out) { - IR_NODE_LINK_TO(conv2d_xpu, bn_out); - } else if (ew_bias_add_out) { - IR_NODE_LINK_TO(conv2d_xpu, ew_bias_add_out); - } else { - 
IR_NODE_LINK_TO(conv2d_xpu, conv_out); + IR_NODE_LINK_TO(fusion_nodes_map["x"], conv2d_xpu); + if (fusion_nodes_map["x_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["x_max"], conv2d_xpu); } - IR_NODE_LINK_TO(conv2d_xpu, conv2d_xpu_out_max); - // delete useless node - std::unordered_set delete_nodes = {conv}; - if (act != nullptr) { - delete_nodes.insert(act); + IR_NODE_LINK_TO(fusion_nodes_map["filter"], conv2d_xpu); + IR_NODE_LINK_TO(fusion_nodes_map["filter_max"], conv2d_xpu); + if (fusion_nodes_map["scale_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["scale_max"], conv2d_xpu); } - if (ew_branch_add != nullptr) { - delete_nodes.insert(ew_branch_add); + if (fusion_nodes_map["bias"]) { + SAFE_IR_NODE_LINK_TO(fusion_nodes_map["bias"], conv2d_xpu); + } + if (fusion_nodes_map["branch"]) { + IR_NODE_LINK_TO(fusion_nodes_map["branch"], conv2d_xpu); + } + if (fusion_nodes_map["branch_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["branch_max"], conv2d_xpu); + } + if (fusion_nodes_map["out_max_in"]) { + IR_NODE_LINK_TO(fusion_nodes_map["out_max_in"], conv2d_xpu); + } + IR_NODE_LINK_TO(conv2d_xpu, fusion_nodes_map["out"]); + IR_NODE_LINK_TO(conv2d_xpu, fusion_nodes_map["out_max"]); + // delete useless node + std::unordered_set delete_nodes; + if (conv != nullptr) { + delete_nodes.insert(conv); } if (scale != nullptr) { delete_nodes.insert(scale); } if (bn != nullptr) { delete_nodes.insert(bn); - delete_nodes.insert(bn_bias); - delete_nodes.insert(bn_var); - delete_nodes.insert(bn_mean); - delete_nodes.insert(bn_scale); - delete_nodes.insert(bn_var_out); - delete_nodes.insert(bn_mean_out); - delete_nodes.insert(bn_saved_var); - delete_nodes.insert(bn_saved_mean); } if (ew_bias_add != nullptr) { delete_nodes.insert(ew_bias_add); - delete_nodes.insert(ew_bias_add_y); + } + if (ew_branch_add != nullptr) { + delete_nodes.insert(ew_branch_add); + } + if (act != nullptr) { + delete_nodes.insert(act); } GraphSafeRemoveNodes(graph, delete_nodes); found_subgraph_count++; diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 4c8424b7df08f..5868db5627021 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -244,9 +244,62 @@ class FcXPUFusePass : public FusePassBase { bool with_bn, const std::string& act_type) const; + void CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + std::string mul_type, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_bias, + bool with_bn, + bool enable_int8) const; + + void CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + void CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + Node* GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const; + const std::string name_scope_{"fc_xpu_fuse_pass"}; }; +Node* FcXPUFusePass::GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const { + auto iter = nodes_map.find(pattern_node_name); + PADDLE_ENFORCE_EQ( + iter != nodes_map.end(), + true, + platform::errors::InvalidArgument("nodes_map[%s] not found in nodes_map", + pattern_node_name.c_str())); + auto node_map = iter->second; + auto node_iter = 
node_map.find(node_name); + PADDLE_ENFORCE_EQ(node_iter != node_map.end(), + true, + platform::errors::InvalidArgument( + "nodes_map[%s][%s] not found in nodes_map", + pattern_node_name.c_str(), + node_name.c_str())); + return node_iter->second; +} + void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); @@ -275,6 +328,395 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_subgraph_count); } +void FcXPUFusePass::CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + std::string mul_type, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_bias, + bool with_bn, + bool enable_int8) const { + // Get Node + auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); + PADDLE_ENFORCE_EQ( + mul != nullptr, + true, + platform::errors::InvalidArgument("mul node ptr can not be null")); + auto* mul_w = GetNodeFromNodesMap(nodes_map, "mul", "mul_w"); + PADDLE_ENFORCE_EQ( + mul_w != nullptr, + true, + platform::errors::InvalidArgument("mul_w node ptr can not be null")); + + // transfilter fp16 --> fp32 + auto* filter_t = + scope->FindVar(mul_w->Name())->GetMutable(); + auto filter_len = filter_t->numel(); + auto filter_dtype = filter_t->dtype(); + if (filter_dtype == phi::DataType::FLOAT16) { + CastToFp32(filter_t, nullptr); + } + + bool transpose_w = false; + if (mul_type == "matmul") { + transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y")); + } else if (mul_type == "matmul_v2") { + transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y")); + } + // Get Weight scale in int8 scene + std::vector weight_scale = + mul->Op()->GetAttrIfExists>("Input_scale_" + + mul_w->Name()); + // Create fusion_bias_node + auto filter_dims = filter_t->dims(); + bool has_bias = with_bn || with_bias; + Node* fusion_bias_node = nullptr; + if (with_bias) { + auto* ew_bias_add_bias = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_bias"); + PADDLE_ENFORCE_EQ(ew_bias_add_bias != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_bias node ptr can not be null")); + auto* ew_bias_add_bias_t = scope->FindVar(ew_bias_add_bias->Name()) + ->GetMutable(); + PrepareBias(graph, scope, block, ew_bias_add_bias, &fusion_bias_node); + } + + if (with_bn) { + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + PADDLE_ENFORCE_EQ( + bn != nullptr, + true, + platform::errors::InvalidArgument("bn node ptr can not be null")); + auto* bn_bias = GetNodeFromNodesMap(nodes_map, "bn", "bn_bias"); + PADDLE_ENFORCE_EQ( + bn_bias != nullptr, + true, + platform::errors::InvalidArgument("bn_bias node ptr can not be null")); + auto* bn_scale = GetNodeFromNodesMap(nodes_map, "bn", "bn_scale"); + PADDLE_ENFORCE_EQ( + bn_scale != nullptr, + true, + platform::errors::InvalidArgument("bn_scale node ptr can not be null")); + auto* bn_var = GetNodeFromNodesMap(nodes_map, "bn", "bn_var"); + PADDLE_ENFORCE_EQ( + bn_var != nullptr, + true, + platform::errors::InvalidArgument("bn_var node ptr can not be null")); + auto* bn_mean = GetNodeFromNodesMap(nodes_map, "bn", "bn_mean"); + PADDLE_ENFORCE_EQ( + bn_mean != nullptr, + true, + platform::errors::InvalidArgument("bn_mean node ptr can not be null")); + + auto bn_bias_t = + scope->Var(bn_bias->Name())->GetMutable(); + PADDLE_ENFORCE_EQ( + filter_dims[0], + bn_bias_t->dims()[0], + platform::errors::InvalidArgument("the shape[%d] of bn bias tensor " + "must equal out_channel[%d] of 
conv", + bn_bias_t->dims()[0], + filter_dims[0])); + auto bn_scale_t = + scope->Var(bn_scale->Name())->GetMutable(); + auto bn_mean_t = + scope->Var(bn_mean->Name())->GetMutable(); + auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); + float* bn_scale_ptr = + bn_scale_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_bias_ptr = + bn_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_mean_ptr = + bn_mean_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_var_ptr = + bn_var_t->mutable_data(paddle::platform::CPUPlace()); + auto mean_len = bn_mean_t->numel(); + auto filter_stride = filter_len / mean_len; + float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); + if (!with_bias) { // prev node is conv + PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); + } + + auto fusion_bias_t = + scope->Var(fusion_bias_node->Name())->GetMutable(); + float* fusion_bias_ptr = + fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + // recompute bias and weights + for (int i = 0; i < mean_len; ++i) { + bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); + } + // recompute the weights + if (!enable_int8) { + float* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; j++) { + filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; + } + } + } else { + int8_t* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + PADDLE_ENFORCE_EQ( + weight_scale.size(), + mean_len, + platform::errors::InvalidArgument( + "Weight max_scale size must equal batch_norm sacle/mean size.")); + for (int i = 0; i < mean_len; i++) { + weight_scale[i] *= fabs(bn_scale_ptr[i]); + } + for (int i = 0; i < mean_len; i++) { + if (bn_scale_ptr[i] < 0) { + for (int j = 0; j < filter_stride; ++j) { + filter_ptr[i * filter_stride + j] *= -1; + } + } + } + } + // recompute bias + if (!with_bias) { + for (int i = 0; i < mean_len; ++i) { + fusion_bias_ptr[i] += (0.0f - bn_mean_ptr[i]) * bn_scale_ptr[i]; + } + } else { + for (int i = 0; i < mean_len; ++i) { + fusion_bias_ptr[i] = + bn_bias_ptr[i] + + (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i]; + } + } + } + + (*fusion_nodes_map)["bias"] = fusion_bias_node; + + Node* filter_intx = nullptr; + Node* filter_max = nullptr; + Node* scale_max = nullptr; + if (!enable_int8) { + PrepareWeight(graph, + scope, + block, + mul_w, + &filter_intx, + &filter_max, + !transpose_w, + weight_scale); + } else { + PrepareWeight(graph, + scope, + block, + mul_w, + &filter_intx, + &filter_max, + !transpose_w, + weight_scale); + } + + bool is_per_channel_need_create_scale_max_node = + !weight_scale.empty() && !IsPerTensorQuant(weight_scale); + if (is_per_channel_need_create_scale_max_node) { + phi::DenseTensor ones_weight_max_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + int max_ptr_size = weight_scale.empty() + ? 
phi::backends::xpu::get_xpu_max_ptr_size(-1) + : weight_scale.size(); + ones_weight_max_tensor.set_type(phi::DataType::FLOAT32); + ones_weight_max_tensor.Resize({max_ptr_size}); + std::vector ones_weight(max_ptr_size, 1.0); + memcpy(cpu_ctx->Alloc(&ones_weight_max_tensor), + ones_weight.data(), + max_ptr_size * sizeof(float)); + + std::string scale_max_name = mul_w->Name() + "_scale_max"; + VarDesc scale_max_desc(scale_max_name); + scale_max_desc.SetPersistable(true); + scale_max_desc.SetShape(vectorize(ones_weight_max_tensor.dims())); + scale_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + scale_max = graph->CreateVarNode(&scale_max_desc); + auto* block_scale_max_desc = block->Var(scale_max_name); + block_scale_max_desc->SetPersistable(scale_max_desc.Persistable()); + block_scale_max_desc->SetShape(scale_max_desc.GetShape()); + block_scale_max_desc->SetDataType(scale_max_desc.GetDataType()); + Assign(ones_weight_max_tensor, + scope->Var(scale_max_name)->GetMutable()); + } + + (*fusion_nodes_map)["w"] = filter_intx; + if (is_per_channel_need_create_scale_max_node) { + (*fusion_nodes_map)["w_max"] = scale_max; + (*fusion_nodes_map)["scale_max"] = filter_max; + } else { + (*fusion_nodes_map)["w_max"] = filter_max; + (*fusion_nodes_map)["scale_max"] = scale_max; + } +} + +void FcXPUFusePass::CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); + PADDLE_ENFORCE_EQ( + mul != nullptr, + true, + platform::errors::InvalidArgument("mul node ptr can not be null")); + // output && output max + std::string fc_xpu_out_name; + Node* fc_out_op_node = nullptr; + Node* fc_out_var_node = nullptr; + + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + auto* ew_bias_add = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add"); + auto* act = GetNodeFromNodesMap(nodes_map, "act", "act"); + if (act) { + auto* act_out = GetNodeFromNodesMap(nodes_map, "act", "act_out"); + PADDLE_ENFORCE_EQ( + act_out != nullptr, + true, + platform::errors::InvalidArgument("act_out node ptr can not be null")); + fc_xpu_out_name = act_out->Name(); + fc_out_var_node = act_out; + fc_out_op_node = act; + } else if (bn) { + auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); + PADDLE_ENFORCE_EQ( + bn_out != nullptr, + true, + platform::errors::InvalidArgument("bn_out node ptr can not be null")); + fc_xpu_out_name = bn_out->Name(); + fc_out_var_node = bn_out; + fc_out_op_node = bn; + } else if (ew_bias_add) { + auto* ew_bias_add_out = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); + PADDLE_ENFORCE_EQ(ew_bias_add_out != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_out node ptr can not be null")); + fc_xpu_out_name = ew_bias_add_out->Name(); + fc_out_var_node = ew_bias_add_out; + fc_out_op_node = ew_bias_add; + } else { + auto* mul_out = GetNodeFromNodesMap(nodes_map, "mul", "mul_out"); + PADDLE_ENFORCE_EQ( + mul_out != nullptr, + true, + platform::errors::InvalidArgument("mul_out node ptr can not be null")); + fc_xpu_out_name = mul_out->Name(); + fc_out_var_node = mul_out; + fc_out_op_node = mul; + } + (*fusion_nodes_map)["out"] = fc_out_var_node; + + // Create out max in + if (enable_int8) { + std::string fc_out_max_in_name = fc_xpu_out_name + "_max_in"; + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc fc_out_max_in_desc(fc_out_max_in_name); + 
fc_out_max_in_desc.SetPersistable(true); + fc_out_max_in_desc.SetShape({static_cast(max_ptr_size)}); + fc_out_max_in_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* fc_xpu_out_max_in = graph->CreateVarNode(&fc_out_max_in_desc); + auto* block_out_max_in_desc = block->Var(fc_out_max_in_name); + block_out_max_in_desc->SetPersistable(fc_out_max_in_desc.Persistable()); + block_out_max_in_desc->SetShape(fc_out_max_in_desc.GetShape()); + block_out_max_in_desc->SetDataType(fc_out_max_in_desc.GetDataType()); + + auto GetOutputScale = [&](Node* var_node, std::string name) -> float { + int nums_any_ops = var_node->outputs.size(); + for (size_t i = 0; i < nums_any_ops; ++i) { + auto* any_op_desc = fc_out_var_node->outputs[i]->Op(); + VLOG(1) << "any_op_desc: " << any_op_desc->Type(); + if (any_op_desc->HasAttr("Input_scale_" + name)) { + VLOG(1) << "find it: " + << "Input_scale_" + name; + return any_op_desc->GetAttrIfExists("Input_scale_" + name); + } + } + return 0; + }; + float output_scale = GetOutputScale(fc_out_var_node, fc_xpu_out_name); + mul->Op()->SetAttr("Input_scale_" + fc_xpu_out_name, output_scale); + VLOG(1) << "fc_xpu_out_name:" << fc_xpu_out_name + << " output_scale: " << output_scale + << "fc_out_var_node name:" << fc_out_var_node->Name(); + phi::DenseTensor out_max_in_cpu_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + out_max_in_cpu_tensor.set_type(phi::DataType::FLOAT32); + out_max_in_cpu_tensor.Resize({max_ptr_size}); + std::vector output_scales(max_ptr_size, output_scale); + memcpy(cpu_ctx->Alloc(&out_max_in_cpu_tensor), + output_scales.data(), + max_ptr_size * sizeof(float)); + Assign(out_max_in_cpu_tensor, + scope->Var(fc_out_max_in_name)->GetMutable()); + (*fusion_nodes_map)["out_max_in"] = fc_xpu_out_max_in; + } + + // Create out max + std::string fc_out_max_name = fc_xpu_out_name + "_max"; + VarDesc fc_out_max_desc(fc_out_max_name); + Node* fc_xpu_out_max = graph->CreateVarNode(&fc_out_max_desc); + (*fusion_nodes_map)["out_max"] = fc_xpu_out_max; +} + +void FcXPUFusePass::CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + // Get Node + auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); + PADDLE_ENFORCE_EQ( + mul != nullptr, + true, + platform::errors::InvalidArgument("mul node ptr can not be null")); + auto* mul_x = GetNodeFromNodesMap(nodes_map, "mul", "mul_x"); + PADDLE_ENFORCE_EQ( + mul_x != nullptr, + true, + platform::errors::InvalidArgument("mul_x node ptr can not be null")); + // x max + std::string mul_x_max_name = mul_x->Name() + "_max"; + Node* mul_x_max = nullptr; + if (enable_int8) { + float input_scale = + mul->Op()->GetAttrIfExists("Input_scale_" + mul_x->Name()); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc x_max_desc(mul_x_max_name); + x_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + x_max_desc.SetShape({static_cast(max_ptr_size)}); + x_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + mul_x_max = graph->CreateVarNode(&x_max_desc); + auto input_max_tensor = + scope->Var(mul_x_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, input_scale); + 
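+    // Replicate the single per-tensor input scale max_ptr_size times so the
+    // persistable x_max tensor matches the XPU max-pointer buffer layout.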
memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + } + (*fusion_nodes_map)["x"] = mul_x; + (*fusion_nodes_map)["x_max"] = mul_x_max; +} + int FcXPUFusePass::ApplyImpl(ir::Graph* graph, const std::string& mul_type, bool with_bias, @@ -287,7 +729,7 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, with_bias, with_bn, act_type); - + auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -311,108 +753,78 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(bn_saved_mean); GET_IR_NODE(act); GET_IR_NODE(act_out); - auto* block = mul->Op()->Block(); - auto* scope = param_scope(); - - auto* filter_t = - scope->FindVar(mul_w->Name())->GetMutable(); - // weight fp16 --> fp32 - auto filter_dtype = filter_t->dtype(); - int out_dtype = proto::VarType::Type::VarType_Type_FP32; - if (filter_dtype == phi::DataType::FLOAT16) { - out_dtype = proto::VarType::Type::VarType_Type_FP16; - CastToFp32(filter_t, nullptr); - } - auto filter_dims = filter_t->dims(); + std::map> nodes_map; + nodes_map.insert( + {"mul", {{"mul_x", mul_x}, {"mul_w", mul_w}, {"mul_out", mul_out}}}); + nodes_map.insert({"ew_bias_add", + {{"ew_bias_add", add}, + {"ew_bias_add_bias", bias}, + {"ew_bias_add_out", add_out}}}); + nodes_map.insert({"bn", + {{"bn", bn}, + {"bn_bias", bn_bias}, + {"bn_mean", bn_mean}, + {"bn_scale", bn_scale}, + {"bn_var", bn_var}, + {"bn_out", bn_out}, + {"bn_var_out", bn_var_out}, + {"bn_mean_out", bn_mean_out}, + {"bn_saved_var", bn_saved_var}, + {"bn_saved_mean", bn_saved_mean}}}); + nodes_map.insert({"act", {{"act", act}, {"act_out", act_out}}}); - bool transpose_w = false; - if (mul_type == "matmul") { - transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y")); - } else if (mul_type == "matmul_v2") { - transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y")); - } + std::map fusion_nodes_map{{"x", nullptr}, + {"x_max", nullptr}, + {"w", nullptr}, + {"w_max", nullptr}, + {"bias", nullptr}, + {"scale_max", nullptr}, + {"out_max_in", nullptr}, + {"out", nullptr}, + {"out_max", nullptr}}; - bool has_bias = with_bn || with_bias; - Node* fusion_bias_node = nullptr; - if (has_bias) { - if (bias != nullptr) { - PrepareBias(graph, scope, block, bias, &fusion_bias_node); - } - if (bn != nullptr) { - auto bn_bias_t = - scope->Var(bn_bias->Name())->GetMutable(); - auto bn_scale_t = - scope->Var(bn_scale->Name())->GetMutable(); - auto bn_mean_t = - scope->Var(bn_mean->Name())->GetMutable(); - auto bn_var_t = - scope->Var(bn_var->Name())->GetMutable(); - float* mul_w_ptr = filter_t->data(); - float* bn_scale_ptr = bn_scale_t->data(); - float* bn_bias_ptr = bn_bias_t->data(); - float* bn_mean_ptr = bn_mean_t->data(); - float* bn_var_ptr = bn_var_t->data(); - auto mean_len = bn_mean_t->numel(); - auto filter_h = filter_dims[0]; - auto filter_w = filter_dims[1]; - float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); - if (fusion_bias_node == nullptr) { // prev node is conv - PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); - } - auto fusion_bias_t = scope->Var(fusion_bias_node->Name()) - ->GetMutable(); - float* fusion_bias_ptr = fusion_bias_t->data(); - // recompute bias and weights - if (bias == nullptr) { - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - fusion_bias_ptr[i] += (0.f - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_h; j++) { - 
mul_w_ptr[j * filter_w + i] *= bn_scale_ptr[i]; - } - } - } else { - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - bn_bias_ptr[i] += - (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_h; j++) { - mul_w_ptr[j * filter_w + i] *= bn_scale_ptr[i]; - } - } - memcpy(fusion_bias_ptr, bn_bias_ptr, mean_len * sizeof(float)); - } - } - } - - Node* mul_w_int16 = nullptr; - Node* mul_w_max = nullptr; - PrepareWeight( - graph, scope, block, mul_w, &mul_w_int16, &mul_w_max, !transpose_w); - - std::string fc_out_name; - if (act_out) { - fc_out_name = act_out->Name(); - } else if (bn) { - fc_out_name = bn_out->Name(); - } else if (add_out) { - fc_out_name = add_out->Name(); - } else { - fc_out_name = mul_out->Name(); - } - std::string fc_out_max_name = fc_out_name + "_max"; - VarDesc fc_out_max_desc(fc_out_max_name); - Node* fc_out_max = graph->CreateVarNode(&fc_out_max_desc); + bool enable_int8 = mul->Op()->GetAttrIfExists("enable_int8"); + std::string op_precision_str = enable_int8 ? "int8" : "fp32"; + VLOG(4) << "FC fusion fuse pass is running on " << op_precision_str + << " precision!"; + auto* block = mul->Op()->Block(); + CreateFusionWeightsAndBias(graph, + scope, + block, + mul_type, + nodes_map, + &fusion_nodes_map, + with_bias, + with_bn, + enable_int8); + CreateFusionInputs( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + CreateFusionOutputs( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + VLOG(1) << "CreateFusionOutputs success!"; // Generate fc_xpu op framework::OpDesc fc_xpu_op_desc(block); fc_xpu_op_desc.SetType("fc_xpu"); - fc_xpu_op_desc.SetInput("x", {mul_x->Name()}); - fc_xpu_op_desc.SetInput("w", {mul_w_int16->Name()}); - fc_xpu_op_desc.SetInput("w_max", {mul_w_max->Name()}); - if (has_bias) { - fc_xpu_op_desc.SetInput("bias", {fusion_bias_node->Name()}); + fc_xpu_op_desc.SetInput("x", {fusion_nodes_map["x"]->Name()}); + if (fusion_nodes_map["x_max"]) { + fc_xpu_op_desc.SetInput("x_max", {fusion_nodes_map["x_max"]->Name()}); + } + fc_xpu_op_desc.SetInput("w", {fusion_nodes_map["w"]->Name()}); + fc_xpu_op_desc.SetInput("w_max", {fusion_nodes_map["w_max"]->Name()}); + if (fusion_nodes_map["bias"]) { + fc_xpu_op_desc.SetInput("bias", {fusion_nodes_map["bias"]->Name()}); + } + if (fusion_nodes_map["scale_max"]) { + fc_xpu_op_desc.SetInput("scale_max", + {fusion_nodes_map["scale_max"]->Name()}); + } + if (fusion_nodes_map["out_max_in"]) { + fc_xpu_op_desc.SetInput("out_max_in", + {fusion_nodes_map["out_max_in"]->Name()}); } + fc_xpu_op_desc.SetOutput("out", {fusion_nodes_map["out"]->Name()}); + fc_xpu_op_desc.SetOutput("out_max", {fusion_nodes_map["out_max"]->Name()}); fc_xpu_op_desc.SetAttr( "in_num_col_dims", static_cast(mul_x->Var()->GetShape().size() - 1)); @@ -440,48 +852,54 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope"))); } } - fc_xpu_op_desc.SetAttr("out_dtype", out_dtype); - fc_xpu_op_desc.SetOutput("out", {fc_out_name}); - fc_xpu_op_desc.SetOutput("out_max", {fc_out_max_name}); + // out_dtype is same to input precision + fc_xpu_op_desc.SetAttr("out_dtype", + fusion_nodes_map["x"]->Var()->GetDataType()); + fc_xpu_op_desc.SetAttr("enable_int8", + mul->Op()->GetAttrIfExists("enable_int8")); + if (enable_int8) { + fc_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["out"]->Name(), + mul->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["out"]->Name())); + 
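+      // Carry the activation scale of the fused op's input over to fc_xpu as
+      // well, mirroring the output-scale attribute set just above.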
fc_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["x"]->Name(), + mul->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["x"]->Name())); + } + auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc); - IR_NODE_LINK_TO(mul_x, fc_xpu); - IR_NODE_LINK_TO(mul_w_int16, fc_xpu); - IR_NODE_LINK_TO(mul_w_max, fc_xpu); - if (bias || bn) { - SAFE_IR_NODE_LINK_TO(fusion_bias_node, fc_xpu); + IR_NODE_LINK_TO(fusion_nodes_map["x"], fc_xpu); + if (fusion_nodes_map["x_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["x_max"], fc_xpu); } - if (act_out) { - IR_NODE_LINK_TO(fc_xpu, act_out); - } else if (bn_out) { - IR_NODE_LINK_TO(fc_xpu, bn_out); - } else if (add_out) { - IR_NODE_LINK_TO(fc_xpu, add_out); - } else { - IR_NODE_LINK_TO(fc_xpu, mul_out); + IR_NODE_LINK_TO(fusion_nodes_map["w"], fc_xpu); + IR_NODE_LINK_TO(fusion_nodes_map["w_max"], fc_xpu); + if (fusion_nodes_map["scale_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["scale_max"], fc_xpu); } - IR_NODE_LINK_TO(fc_xpu, fc_out_max); + if (fusion_nodes_map["bias"]) { + IR_NODE_LINK_TO(fusion_nodes_map["bias"], fc_xpu); + } + if (fusion_nodes_map["out_max_in"]) { + IR_NODE_LINK_TO(fusion_nodes_map["out_max_in"], fc_xpu); + } + IR_NODE_LINK_TO(fc_xpu, fusion_nodes_map["out"]); + IR_NODE_LINK_TO(fc_xpu, fusion_nodes_map["out_max"]); // delete useless node std::unordered_set delete_nodes; - if (act != nullptr && add != nullptr) { - delete_nodes = {mul, mul_out, add, add_out, act}; - } else if (act) { - delete_nodes = {mul, mul_out, act}; - } else if (add) { - delete_nodes = {mul, mul_out, add}; - } else { - delete_nodes = {mul}; + if (mul != nullptr) { + delete_nodes.insert(mul); } if (bn != nullptr) { delete_nodes.insert(bn); - delete_nodes.insert(bn_bias); - delete_nodes.insert(bn_var); - delete_nodes.insert(bn_mean); - delete_nodes.insert(bn_scale); - delete_nodes.insert(bn_var_out); - delete_nodes.insert(bn_mean_out); - delete_nodes.insert(bn_saved_var); - delete_nodes.insert(bn_saved_mean); + } + if (add != nullptr) { + delete_nodes.insert(add); + } + if (act != nullptr) { + delete_nodes.insert(act); } GraphSafeRemoveNodes(graph, delete_nodes); found_subgraph_count++; diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 1a9db472bc2cc..3a6d29f794d65 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -77,8 +77,14 @@ LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern, const std::string& name_scope, bool with_branch) : PatternBase(pattern, name_scope, name_scope), with_branch_(with_branch) { - auto* fusion_op = - pattern->NewNode(fusion_op_repr())->assert_is_op("conv2d_xpu"); + auto* fusion_op = pattern->NewNode(fusion_op_repr()) + ->assert_is_op("conv2d_xpu") + ->assert_more([&](Node* node) { + bool enable_int8 = + node->Op()->GetAttrIfExists("enable_int8"); + return !enable_int8; + }); + auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x"); PDNode* branch = nullptr; if (with_branch_) { @@ -177,7 +183,12 @@ void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const { auto preop_max_var_name = x_pre_op->Output("out_max"); for (auto max_node : x->inputs[0]->outputs) { if (preop_max_var_name[0] == max_node->Name()) { - fusion_op_desc->SetInput("x_max", {max_node->Name()}); + if (fusion_op_desc->HasInput("x_max")) { + auto x_max_old_name = fusion_op_desc->Input("x_max")[0]; + fusion_op_desc->RenameInput(x_max_old_name, max_node->Name()); + } else { 
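+            // No x_max input exists on this op yet, so wire the producer's
+            // out_max variable in as its x_max.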
+ fusion_op_desc->SetInput("x_max", {max_node->Name()}); + } IR_NODE_LINK_TO(max_node, fusion_op); } } diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.cc b/paddle/fluid/framework/ir/xpu/pass_utils.cc index eeb0e23e19ecd..b895033108e12 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.cc +++ b/paddle/fluid/framework/ir/xpu/pass_utils.cc @@ -121,12 +121,115 @@ size_t HashTensor(const phi::DenseTensor& in) { template size_t HashTensor(const phi::DenseTensor& in); template size_t HashTensor(const phi::DenseTensor& in); +template size_t HashTensor(const phi::DenseTensor& in); std::string GetPrefixWithoutHash(const std::string& name) { std::size_t found = name.find("_#"); return found == std::string::npos ? name : name.substr(0, found); } +template +void PrepareWeight(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales) { + auto weight_name = weight->Name(); + auto* weight_tensor = scope->Var(weight_name)->GetMutable(); + phi::DenseTensor quant_weight_tensor; + Assign(*weight_tensor, &quant_weight_tensor); + phi::DenseTensor quant_weight_max_tensor; + ConvertWeightWrapper( + &quant_weight_tensor, &quant_weight_max_tensor, transpose, weight_scales); + size_t quant_weight_hash = HashTensor(quant_weight_tensor); + size_t quant_weight_max_hash = HashTensor(quant_weight_max_tensor); + std::string pre_name = GetPrefixWithoutHash(weight_name); + std::string quant_weight_name = + pre_name + "_#" + std::to_string(quant_weight_hash); + std::string quant_weight_max_name = + pre_name + "_max_#" + std::to_string(quant_weight_max_hash); + *quant_weight = FindNodeWithName(graph, quant_weight_name); + if (*quant_weight == nullptr) { + // Create quant_weight node + // Update quant_weight var_desc in block + VarDesc quant_weight_desc(quant_weight_name); + quant_weight_desc.SetPersistable(true); + quant_weight_desc.SetShape(vectorize(quant_weight_tensor.dims())); + quant_weight_desc.SetDataType( + framework::TransToProtoVarType(quant_weight_tensor.dtype())); + *quant_weight = graph->CreateVarNode(&quant_weight_desc); + auto* block_quant_weight_desc = block->Var(quant_weight_name); + block_quant_weight_desc->SetPersistable(quant_weight_desc.Persistable()); + block_quant_weight_desc->SetShape(quant_weight_desc.GetShape()); + block_quant_weight_desc->SetDataType(quant_weight_desc.GetDataType()); + // Create quant_weight_max node + // Update quant_weight_max var_desc in block + VarDesc quant_weight_max_desc(quant_weight_max_name); + quant_weight_max_desc.SetPersistable(true); + quant_weight_max_desc.SetShape(vectorize(quant_weight_max_tensor.dims())); + quant_weight_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + *quant_weight_max = graph->CreateVarNode(&quant_weight_max_desc); + auto* block_quant_weight_max_desc = block->Var(quant_weight_max_name); + block_quant_weight_max_desc->SetPersistable( + quant_weight_max_desc.Persistable()); + block_quant_weight_max_desc->SetShape(quant_weight_max_desc.GetShape()); + block_quant_weight_max_desc->SetDataType( + quant_weight_max_desc.GetDataType()); + // Find dst/dst_max variable in scope + auto* quant_weight_var = scope->FindVar(quant_weight_name); + if (quant_weight_var == nullptr) { + // Create quant_weight/quant_weight_max variable/tensor + Assign(quant_weight_tensor, + scope->Var(quant_weight_name)->GetMutable()); + Assign(quant_weight_max_tensor, + scope->Var(quant_weight_max_name)->GetMutable()); + } else { + // Share the same 
variable + PADDLE_ENFORCE_NOT_NULL( + scope->FindVar(quant_weight_max_name), + platform::errors::Fatal("quant_weight_max(%s) variable should not be " + "nullptr if quant_weight(%s) " + "variable is exist. (weight_name is %s)", + quant_weight_max_name, + quant_weight_name, + weight_name)); + } + } else { + *quant_weight_max = FindNodeWithName(graph, quant_weight_max_name); + PADDLE_ENFORCE_NOT_NULL( + *quant_weight_max, + platform::errors::Fatal("quant_weight_max(%s) variable should not be " + "nullptr if quant_weight(%s) " + "variable is exist. (weight_name is %s)", + quant_weight_max_name, + quant_weight_name, + weight_name)); + } +} + +template void PrepareWeight( + Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales); + +template void PrepareWeight( + Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales); + template void PrepareWeight(Graph* graph, Scope* scope, @@ -137,6 +240,7 @@ void PrepareWeight(Graph* graph, bool transpose) { auto src_name = src->Name(); auto* src_tensor = scope->Var(src_name)->GetMutable(); + phi::DenseTensor dst_tensor; Assign(*src_tensor, &dst_tensor); phi::DenseTensor dst_max_tensor; diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.h b/paddle/fluid/framework/ir/xpu/pass_utils.h index d1e7b218a0b46..556a14fa0e9e4 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.h +++ b/paddle/fluid/framework/ir/xpu/pass_utils.h @@ -57,6 +57,28 @@ std::vector FindOpNodeByInputName(Graph* graph, template size_t HashTensor(const phi::DenseTensor& in); +template ::value, Tcpu>::type* + ptr = nullptr> +void ConvertWeightWrapper(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + ConvertWithQuant(weight, weight_max, transpose, weight_scales); +} + +template ::value, Tcpu>::type* + ptr = nullptr> +void ConvertWeightWrapper(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + ConvertWithoutQuant(weight, weight_max, transpose, weight_scales); +} + template void PrepareWeight(Graph* graph, Scope* scope, @@ -66,6 +88,16 @@ void PrepareWeight(Graph* graph, Node** dst_max, bool transpose); +template +void PrepareWeight(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales); + void PrepareBias( Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst); diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index fcda50051a362..ada4a4b9b6c2f 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -145,6 +145,41 @@ void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out) { } } +void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out) { + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + + paddle::experimental::CheckAndTrans2Contiguous(in); + + phi::DenseTensor int8_tensor; + phi::DenseTensor* out_ptr = out == nullptr ? 
&int8_tensor : out; + out_ptr->Resize(in->dims()); + out_ptr->set_type(phi::DataType::INT8); + out_ptr->set_layout(in->layout()); + + switch (in->dtype()) { + case phi::DataType::FLOAT32: + phi::CastKernel(*cpu_ctx, *in, phi::DataType::INT8, out_ptr); + break; + case phi::DataType::INT8: + if (out == nullptr) { + return; + } else { + phi::AssignKernel(*cpu_ctx, *in, out_ptr); + } + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support fp32, but received dtype is %s.", + phi::DataTypeToString(in->dtype()))); + break; + } + + if (out == nullptr) { + Assign(*out_ptr, in); + } +} + static float FindMaxAbs(const float* data, int len) { float max_f = 0.0f; for (int i = 0; i < len; ++i) { @@ -258,6 +293,100 @@ void QuantFP32ToIntX(const float* src_ptr, } } +template < + typename Tcpu, + typename Txpu, + typename std::enable_if::value, Tcpu>::type* ptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + LOG(FATAL) << "Not support for Tcpu is " + << phi::CppTypeToDataType::Type(); +} + +template < + typename Tcpu, + typename Txpu, + typename std::enable_if::value, Tcpu>::type* ptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + if (!weight_scales.empty()) { + LOG(FATAL) << "Weight scales should be empty(), otherwise, check if your " + "model is quant model or not."; + } + + // Convert fp16 to fp32 + phi::DenseTensor weight_fp32; + CastToFp32(weight, &weight_fp32); + + if (transpose) { + Transpose2D(&weight_fp32); + } + + // Find max + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + int size = weight_fp32.numel(); + auto* weight_data = weight_fp32.data(); + float max_val = FindMaxAbs(weight_data, size); + std::vector max_vec(max_ptr_size, max_val); + weight_max->set_type(phi::DataType::FLOAT32); + weight_max->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + memcpy(cpu_ctx->Alloc(weight_max), + max_vec.data(), + max_ptr_size * sizeof(float)); + + // Quant + weight->set_type(phi::CppTypeToDataType::Type()); + weight->Resize(weight_fp32.dims()); + QuantFP32ToIntX( + weight_data, cpu_ctx->Alloc(weight), max_val, size); +} + +template +void ConvertWithoutQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + if (transpose) { + Transpose2D(weight); + } + if (std::is_same::value || std::is_same::value) { + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + int max_ptr_size = weight_scales.empty() + ? 
phi::backends::xpu::get_xpu_max_ptr_size(-1) + : weight_scales.size(); + weight_max->set_type(phi::DataType::FLOAT32); + weight_max->Resize({max_ptr_size}); + if (!weight_scales.empty()) { + memcpy(cpu_ctx->Alloc(weight_max), + weight_scales.data(), + max_ptr_size * sizeof(float)); + } else { + LOG(FATAL) << "weight scales cannot be empty!"; + } + } else { + LOG(FATAL) << "Only support int8<->int8 and int16<->int16 convert."; + } +} + +template void ConvertWithQuant( + phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + +template void ConvertWithoutQuant( + phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + template void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, @@ -298,6 +427,23 @@ template void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose); +bool IsPerTensorQuant(const std::vector& weight_max) { + bool per_tensor = true; + PADDLE_ENFORCE_GT( + weight_max.size(), + 0, + platform::errors::InvalidArgument( + "Op's channel size: [%d] should great than zero", weight_max.size())); + auto first = weight_max[0]; + for (size_t i = 1; i < weight_max.size(); ++i) { + if (std::abs(first - weight_max[i]) > 1e-6) { + per_tensor = false; + break; + } + } + return per_tensor; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h index b417fa03323db..30f73023b632d 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.h +++ b/paddle/fluid/framework/ir/xpu/quant_utils.h @@ -25,8 +25,34 @@ void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); +void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); + void CastToInt32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); +template +void ConvertWithoutQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + +template ::value, Tcpu>::type* + ptr = nullptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + +template ::value, + Tcpu>::type* ptr = nullptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + // 1. Quant weight from fp32 to int16/int31 // 2. Weight data is in-place update. // 3. 
Generate weight max tensor @@ -35,6 +61,8 @@ void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose); +bool IsPerTensorQuant(const std::vector& weight_max); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 0af6876faca05..2561e14d06d1e 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -13,8 +13,13 @@ cc_library( cc_library( convert_to_mixed_precision SRCS convert_to_mixed_precision.cc - DEPS analysis_pass ir_graph_build_pass auto_mixed_precision_pass - constant_folding_pass identity_op_clean_pass) + DEPS analysis_pass + ir_graph_build_pass + auto_mixed_precision_pass + constant_folding_pass + identity_op_clean_pass + delete_quant_dequant_linear_op_pass + delete_weight_dequant_linear_op_pass) cc_library( ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index d706113307009..ef352712102c4 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -17,6 +17,8 @@ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/constant_folding_pass.h" +#include "paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.h" +#include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/identity_op_clean_pass.h" #include "paddle/fluid/inference/io.h" @@ -89,12 +91,22 @@ void ConvertToMixedPrecisionPass::LoadModel() { void ConvertToMixedPrecisionPass::Run() { LoadModel(); + if (backend_ == phi::Backend::XPU) { + framework::ir::DeleteQuantDequantLinearOpPass + delete_quant_dequant_linear_op_pass; + delete_quant_dequant_linear_op_pass.Apply(main_graph_.get()); + framework::ir::DeleteWeightDequantLinearOpPass + delete_weight_dequant_linear_op_pass; + delete_weight_dequant_linear_op_pass.Apply(main_graph_.get()); + } + framework::ir::ConstantFoldingPass constant_folding_pass; constant_folding_pass.Apply(main_graph_.get()); framework::ir::AutoMixedPrecisionPass auto_mixed_precision_pass; auto_mixed_precision_pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); + if (backend_ == phi::Backend::GPU) { auto_mixed_precision_pass.Set("enable_gpu_mixed", new bool{true}); } else if (backend_ == phi::Backend::XPU) { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index c7f3f87a4d192..65fd8a74aa101 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -507,6 +507,8 @@ void CpuPassStrategy::EraseFcMkldnnPasses() { XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { passes_.assign({ + "delete_quant_dequant_linear_op_pass", + "delete_weight_dequant_linear_op_pass", "delete_assign_op_pass", "delete_dropout_op_pass", "delete_concat_op_pass", @@ -562,6 +564,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "delete_isolated_node_pass", // "auto_mixed_precision_pass", "cast_mixed_precision_op_fuse_pass", + 
"auto_trans_quantize_op_precision_pass", "inplace_op_var_pass", }); use_xpu_ = true; diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index 226c87b35d458..7d8f28c61d49f 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -62,14 +62,14 @@ optional : bias, x_max - op : conv2d_xpu - args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, int act_type, float act_param, DataType out_dtype) + args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, Tensor scale_max, Tensor out_max_in, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, int act_type, float act_param, DataType out_dtype) output : Tensor(out), Tensor(out_max) infer_meta : func : Conv2dXPUInferMeta kernel : func : conv2d_xpu data_type : x - optional : bias, branch, branch_max ,x_max + optional : bias, branch, branch_max ,x_max, scale_max, out_max_in - op : embedding_with_eltwise_add_xpu args : (Tensor[] ids, Tensor[] tables, Tensor mask, int64_t padding_idx) @@ -101,14 +101,14 @@ data_type : x - op : fc_xpu - args : (Tensor x, Tensor x_max, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha, DataType out_dtype) + args : (Tensor x, Tensor x_max, Tensor w, Tensor w_max, Tensor bias, Tensor scale_max, Tensor out_max_in, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha, DataType out_dtype) output : Tensor(out), Tensor(out_max) infer_meta : func : FcXPUInferMeta kernel : func : fc_xpu data_type : x - optional : bias, x_max + optional : bias, x_max, scale_max, out_max_in - op : fused_bias_act args : (Tensor x, Tensor bias, Tensor dequant_scales, Tensor shift, Tensor smooth, str act_method = "gelu", str compute_dtype = "default", float quant_scale = -1, int quant_round_type = 1, float quant_max_bound = 127.0, float quant_min_bound = -127.0) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 39defa8bdddd7..f8139af52cb22 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -176,7 +176,9 @@ XPUOpMap& get_kl2_ops() { {"conv1d_xpu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"conv2d_xpu", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT8})}, {"conv3d_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"conv3d", @@ -317,7 +319,9 @@ XPUOpMap& get_kl2_ops() { {"fast_layernorm_xpu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"fc_xpu", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT8})}, {"fill", XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 8dfdf7f89fde7..679eb70ccfd01 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -227,6 +227,8 @@ void Conv2dXPUInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& branch, const MetaTensor& branch_max, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, const 
std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -377,6 +379,8 @@ void FcXPUInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& w_max, const MetaTensor& bias, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index ecda5cb9c8818..08469f4cec577 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -62,6 +62,8 @@ void Conv2dXPUInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& branch, const MetaTensor& branch_max, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -86,6 +88,8 @@ void FcXPUInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& w_max, const MetaTensor& bias, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc index 43caa13698b48..9dce663de72c7 100644 --- a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "glog/logging.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" namespace phi { namespace fusion { @@ -32,6 +35,8 @@ void Conv2dXPUKernelImpl(const Context& ctx, const paddle::optional& bias, const paddle::optional& branch, const paddle::optional& branch_max, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -66,14 +71,22 @@ void Conv2dXPUKernelImpl(const Context& ctx, int out_c = static_cast(filter_dims[0]); int win_h = static_cast(filter_dims[2]); int win_w = static_cast(filter_dims[3]); - + VLOG(1) << "KERNEL1"; auto* input_data = reinterpret_cast(x.data()); + VLOG(1) << "KERNEL1.5"; const float* input_max_data = x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data(); + VLOG(1) << "KERNEL2"; auto* filter_data = reinterpret_cast(filter.data()); auto* filter_max_data = filter_max.data(); + auto* scale_max_data = scale_max.get_ptr() == nullptr + ? nullptr + : scale_max.get_ptr()->data(); const XPUTypeOut* branch_data = nullptr; + const float* branch_max_data = branch_max.get_ptr() == nullptr + ? 
nullptr + : branch_max.get_ptr()->data(); auto* branch_tensor = branch.get_ptr(); xpu::ctx_guard RAII_GUARD(ctx.x_context()); if (branch_tensor != nullptr) { @@ -81,32 +94,269 @@ void Conv2dXPUKernelImpl(const Context& ctx, branch_data = reinterpret_cast(branch_tensor->data()); } else { - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::cast( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); - branch_data = branch_data_temp; + if (branch_tensor->dtype() == phi::DataType::FLOAT32 && + out->dtype() == phi::DataType::INT8) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT32 && " + "out->dtype() == phi::DataType::INT8"; + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::quantization( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + branch_data = reinterpret_cast(branch_data_temp); + } else if (branch_tensor->dtype() == phi::DataType::FLOAT16 && + out->dtype() == phi::DataType::INT8) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT16 && " + "out->dtype() == phi::DataType::INT8"; + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::quantization( + ctx.x_context(), + reinterpret_cast( + branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + branch_data = reinterpret_cast(branch_data_temp); + } else if (branch_tensor->dtype() == phi::DataType::INT8 && + out->dtype() == phi::DataType::FLOAT32) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " + "out->dtype() == phi::DataType::FLOAT32"; + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch.get_ptr()->dtype(), + // branch.get_ptr()->numel() * sizeof(int8_t)); + // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_before[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::dequantization( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // phi::DataType::FLOAT32, + // branch.get_ptr()->numel() * sizeof(float)); + // memory_utils::Copy(CPUPlace(), + // static_cast(temp_tensor_cpu.data()), + // ctx.GetPlace(), + // static_cast(branch_data_temp), + // branch.get_ptr()->numel() * sizeof(float)); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_after[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + branch_data = reinterpret_cast(branch_data_temp); + } else if (branch_tensor->dtype() == phi::DataType::INT8 && + out->dtype() == phi::DataType::FLOAT16) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " + "out->dtype() == phi::DataType::FLOAT16"; + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch.get_ptr()->dtype(), + // 
branch.get_ptr()->numel() * sizeof(int8_t)); + // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_before[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::dequantization( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // phi::DataType::FLOAT16, + // branch.get_ptr()->numel() * + // sizeof(dtype::float16)); + // memory_utils::Copy(CPUPlace(), + // static_cast(temp_tensor_cpu.data()), + // ctx.GetPlace(), + // static_cast(branch_data_temp), + // branch.get_ptr()->numel() * sizeof(dtype::float16)); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_after[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + branch_data = reinterpret_cast(branch_data_temp); + } else { + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::cast( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + branch_data = branch_data_temp; + } } } - const float* branch_max_data = branch_max.get_ptr() == nullptr - ? nullptr - : branch_max.get_ptr()->data(); + VLOG(1) << "KERNEL3"; const float* bias_data = bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); auto* out_data = reinterpret_cast(ctx.template Alloc(out)); auto* out_max_data = ctx.template Alloc(out_max); + out_max_data = out_max_in.get_ptr() != nullptr + ? 
const_cast(out_max_in.get_ptr()->data()) + : out_max_data; + VLOG(1) << "KERNEL4.5"; xpu::Activation_t act(static_cast(act_type)); + VLOG(1) << "KERNEL5"; if (act_type == xpu::Activation_t::LEAKY_RELU) { act.leaky_alpha = act_param; } else if (act_type == xpu::Activation_t::HARD_SIGMOID) { act.hard_sigmoid_slope = act_param; } + // if (input_max_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // x_max.get_ptr()->dtype(), + // x_max.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *x_max.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { + // VLOG(1) << "input_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + + // if (filter_max_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // filter_max.dtype(), + // filter_max.numel() * sizeof(float)); + // phi::Copy(ctx, filter_max, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { + // VLOG(1) << "filter_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + + // if (input_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, x.dtype(), x.numel() * sizeof(T_X)); + // phi::Copy(ctx, x, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "input_data[" << i + // << "]:" << static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (filter_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); + // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "filter_data[" << i + // << "]:" << static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (bias_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // bias.get_ptr()->dtype(), + // bias.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *bias.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "bias_data[" << i << "]:" << + // temp_tensor_cpu.data()[i]; + // } + // } + + // if (branch_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch.get_ptr()->dtype(), + // branch.get_ptr()->numel() * sizeof(T_OUT)); + // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (branch_max) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch_max.get_ptr()->dtype(), + // branch_max.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *branch_max.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + + // if (scale_max) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // scale_max.get_ptr()->dtype(), + // scale_max.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *scale_max.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "scale_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + // if (filter_data) { + // DenseTensor temp_tensor_cpu; + // 
ctx.template HostAlloc( + // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); + // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "filter_data[" << i + // << "]:" << static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (out_max_in.get_ptr()) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, out_max_in.get_ptr()->dtype(), + // out_max_in.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *out_max_in.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < out_max_in.get_ptr()->numel(); + // ++i) { + // VLOG(1) << "output_max_data_before[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } int r = xpu:: conv2d_fusion( // TX/TW/TY/TGEMM /* baidu::xpu::api::Context* ctx */ ctx.x_context(), @@ -131,8 +381,32 @@ void Conv2dXPUKernelImpl(const Context& ctx, /* const TY* branch */ branch_data, /* const baidu::xpu::api::Activation_t& act */ act, /* const float* branch_maxptr */ branch_max_data, - /* const float* scale */ nullptr); + /* const float* scale */ scale_max_data); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_xpu"); + // if (out_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, out->dtype(), out->numel() * sizeof(T_OUT)); + // phi::Copy(ctx, *out, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "output_data[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (out_max) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, out_max->dtype(), out_max->numel() * + // sizeof(float)); + // phi::Copy(ctx, *out_max, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "output_max_data_after[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } } #define CONV2D_XPU_KERNEL_IMPL(x_dtype_, w_dtype_, out_dtype_, gemm_dtype_) \ @@ -145,6 +419,8 @@ void Conv2dXPUKernelImpl(const Context& ctx, bias, \ branch, \ branch_max, \ + scale_max, \ + out_max_in, \ paddings, \ dilations, \ strides, \ @@ -164,6 +440,8 @@ void Conv2dXPUKernel(const Context& ctx, const paddle::optional& bias, const paddle::optional& branch, const paddle::optional& branch_max, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -174,14 +452,118 @@ void Conv2dXPUKernel(const Context& ctx, DataType out_dtype, DenseTensor* out, DenseTensor* out_max) { - if (out_dtype == DataType::FLOAT32) { - CONV2D_XPU_KERNEL_IMPL(T, int16_t, float, int16_t); - } else if (out_dtype == DataType::FLOAT16) { - CONV2D_XPU_KERNEL_IMPL(T, int16_t, dtype::float16, int16_t); - } else { - PADDLE_THROW(phi::errors::Unimplemented("Not support out_dtype is %s.", - DataTypeToString(out_dtype))); + // Dont use template T param + VLOG(1) << "Kernel type: " << x.dtype() << "," << filter.dtype() << " ," + << out_dtype; + if (x.dtype() == DataType::FLOAT32) { + // float32/float16 kernel + if (filter.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(float, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL(float, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " 
+ "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else if (filter.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(float, int8_t, float, int8_t); + } else if (out_dtype == DataType::INT8) { + CONV2D_XPU_KERNEL_IMPL(float, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + return; } + + if (x.dtype() == DataType::FLOAT16) { + // float16 kernel + if (filter.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(phi::dtype::float16, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL( + phi::dtype::float16, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else if (filter.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL( + phi::dtype::float16, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + CONV2D_XPU_KERNEL_IMPL(phi::dtype::float16, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + if (x.dtype() == DataType::INT8) { + if (filter.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(int8_t, int8_t, float, int8_t); + } else if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL(int8_t, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + CONV2D_XPU_KERNEL_IMPL(int8_t, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); } } // namespace fusion @@ -192,4 +574,5 @@ PD_REGISTER_KERNEL(conv2d_xpu, ALL_LAYOUT, phi::fusion::Conv2dXPUKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + int8_t) {} diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc 
b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index 6a6721194e9a8..f2acd0893a6f7 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "glog/logging.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" @@ -29,6 +30,8 @@ void FcXPUKernelImpl(const Context& ctx, const DenseTensor& w, const DenseTensor& w_max, const paddle::optional& bias, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, @@ -53,7 +56,13 @@ void FcXPUKernelImpl(const Context& ctx, bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); auto* out_data = reinterpret_cast(ctx.template Alloc(out)); + auto* scale_max_data = scale_max.get_ptr() == nullptr + ? nullptr + : scale_max.get_ptr()->data(); auto* out_max_data = ctx.template Alloc(out_max); + out_max_data = out_max_in.get_ptr() != nullptr + ? const_cast(out_max_in.get_ptr()->data()) + : out_max_data; xpu::Activation_t act(static_cast(act_type)); if (act_type == xpu::Activation_t::LEAKY_RELU) { act.leaky_alpha = act_alpha; @@ -80,7 +89,9 @@ void FcXPUKernelImpl(const Context& ctx, alpha, // alpha beta, // beta bias_data, // bias - act); + act, // act + scale_max_data); // scale + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_xpu"); } @@ -92,6 +103,8 @@ void FcXPUKernelImpl(const Context& ctx, w, \ w_max, \ bias, \ + scale_max, \ + out_max_in, \ in_num_col_dims, \ transpose_x, \ alpha, \ @@ -108,6 +121,8 @@ void FcXPUKernel(const Context& ctx, const DenseTensor& w, const DenseTensor& w_max, const paddle::optional& bias, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, @@ -117,14 +132,117 @@ void FcXPUKernel(const Context& ctx, DataType out_dtype, DenseTensor* out, DenseTensor* out_max) { - if (out_dtype == DataType::FLOAT32) { - FC_XPU_KERNEL_IMPL(T, int16_t, float, int16_t); - } else if (out_dtype == DataType::FLOAT16) { - FC_XPU_KERNEL_IMPL(T, int16_t, dtype::float16, int16_t); - } else { - PADDLE_THROW(phi::errors::Unimplemented("Not support out_dtype is %s.", - DataTypeToString(out_dtype))); + // Dont use template T param + VLOG(1) << "Kernel type: " << x.dtype() << "," << w.dtype() << " ," + << out_dtype; + if (x.dtype() == DataType::FLOAT32) { + // float32/float16 kernel + if (w.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(float, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(float, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else if (w.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(float, int8_t, float, int8_t); + } else if (out_dtype == DataType::INT8) { + FC_XPU_KERNEL_IMPL(float, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype 
is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + if (x.dtype() == DataType::FLOAT16) { + // float16 kernel + if (w.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(phi::dtype::float16, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL( + phi::dtype::float16, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else if (w.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(phi::dtype::float16, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + FC_XPU_KERNEL_IMPL(phi::dtype::float16, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + return; } + + if (x.dtype() == DataType::INT8) { + if (w.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(int8_t, int8_t, float, int8_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(int8_t, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + FC_XPU_KERNEL_IMPL(int8_t, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); } } // namespace fusion @@ -135,4 +253,5 @@ PD_REGISTER_KERNEL(fc_xpu, ALL_LAYOUT, phi::fusion::FcXPUKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + int8_t) {} From 9483e72100f906d901a970090464bfee81196ad8 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Wed, 20 Sep 2023 11:05:54 +0800 Subject: [PATCH 02/15] support fc_xpu int8 --- .../auto_trans_quantize_op_precision_pass.cc | 54 +++++++++---------- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 12 +++-- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 15 +++++- paddle/fluid/framework/ir/xpu/quant_utils.cc | 5 +- .../ir/xpu/reshape2_matmul_xpu_fuse_pass.cc | 27 ++++++++++ .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 4 +- 6 files changed, 80 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc index c8b4b7c040f7e..9fec1091bd9a9 100644 --- a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc +++ b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc @@ -44,7 
+44,7 @@ class AutoTransQuantizeOpPrecisionPass : public FusePassBase { const std::string name_scope_{"auto_trans_quantize_op_precision_pass"}; const std::unordered_set support_fusion_quant_op_type_{ - "conv2d_xpu"}; + "conv2d_xpu", "fc_xpu"}; }; static inline Node* GetOpOutVarNodeByArgsName(ir::Graph* graph, @@ -72,35 +72,33 @@ void AutoTransQuantizeOpPrecisionPass::FirstRound(ir::Graph* graph) const { bool enable_int8 = op_node->Op()->GetAttrIfExists("enable_int8"); int out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); if (enable_int8) { - if (op_type == "conv2d_xpu") { - auto* out_var_node = - GetOpOutVarNodeByArgsName(subgraph, op_node, "out"); - PADDLE_ENFORCE_NOT_NULL( - out_var_node, - platform::errors::InvalidArgument( - "out_var_node in graph cannot be nullptr.")); - bool is_int8_out = true; - for (auto* next_op_node : out_var_node->outputs) { - auto next_op_type = next_op_node->Op()->Type(); - bool is_next_op_support_int8 = - next_op_node->Op()->GetAttrIfExists("enable_int8") && - ((support_fusion_quant_op_type_.find(next_op_type) != - support_fusion_quant_op_type_.end())); - if (!is_next_op_support_int8) { - is_int8_out = false; - break; - } - } - if (is_int8_out) { - op_node->Op()->SetAttr( - "out_dtype", - static_cast(proto::VarType::Type::VarType_Type_INT8)); - out_var_node->Var()->SetDataType( - proto::VarType::Type::VarType_Type_INT8); - VLOG(1) << "The out var node " << out_var_node->Name() - << " is INT8"; + auto* out_var_node = + GetOpOutVarNodeByArgsName(subgraph, op_node, "out"); + PADDLE_ENFORCE_NOT_NULL( + out_var_node, + platform::errors::InvalidArgument( + "out_var_node in graph cannot be nullptr.")); + bool is_int8_out = true; + for (auto* next_op_node : out_var_node->outputs) { + auto next_op_type = next_op_node->Op()->Type(); + bool is_next_op_support_int8 = + next_op_node->Op()->GetAttrIfExists("enable_int8") && + ((support_fusion_quant_op_type_.find(next_op_type) != + support_fusion_quant_op_type_.end())); + if (!is_next_op_support_int8) { + is_int8_out = false; + break; } } + if (is_int8_out) { + op_node->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + out_var_node->Var()->SetDataType( + proto::VarType::Type::VarType_Type_INT8); + VLOG(1) << "The out var node " << out_var_node->Name() + << " is INT8"; + } } } } diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 5868db5627021..f087b7caf20ab 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -367,8 +367,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( } // Get Weight scale in int8 scene std::vector weight_scale = - mul->Op()->GetAttrIfExists>("Input_scale_" + - mul_w->Name()); + mul->Op()->GetAttrIfExists>("weight_scale"); // Create fusion_bias_node auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_bias; @@ -754,8 +753,11 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(act); GET_IR_NODE(act_out); std::map> nodes_map; - nodes_map.insert( - {"mul", {{"mul_x", mul_x}, {"mul_w", mul_w}, {"mul_out", mul_out}}}); + nodes_map.insert({"mul", + {{"mul", mul}, + {"mul_x", mul_x}, + {"mul_w", mul_w}, + {"mul_out", mul_out}}}); nodes_map.insert({"ew_bias_add", {{"ew_bias_add", add}, {"ew_bias_add_bias", bias}, @@ -785,7 +787,7 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, bool enable_int8 = mul->Op()->GetAttrIfExists("enable_int8"); std::string op_precision_str = enable_int8 ? 
"int8" : "fp32"; - VLOG(4) << "FC fusion fuse pass is running on " << op_precision_str + VLOG(1) << "FC fusion fuse pass is running on " << op_precision_str << " precision!"; auto* block = mul->Op()->Block(); CreateFusionWeightsAndBias(graph, diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 3a6d29f794d65..d9ab5448d0fda 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -106,7 +106,13 @@ struct LinkFcPattern : public PatternBase { LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, name_scope) { - auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op("fc_xpu"); + auto* fusion_op = pattern->NewNode(fusion_op_repr()) + ->assert_is_op("fc_xpu") + ->assert_more([&](Node* node) { + bool enable_int8 = + node->Op()->GetAttrIfExists("enable_int8"); + return !enable_int8; + }); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); fusion_op->LinksFrom({x}); @@ -231,7 +237,12 @@ void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const { auto preop_max_var_name = x_pre_op->Output("out_max"); for (auto max_node : x->inputs[0]->outputs) { if (preop_max_var_name[0] == max_node->Name()) { - fusion_op_desc->SetInput("x_max", {max_node->Name()}); + if (fusion_op_desc->HasInput("x_max")) { + auto x_max_old_name = fusion_op_desc->Input("x_max")[0]; + fusion_op_desc->RenameInput(x_max_old_name, max_node->Name()); + } else { + fusion_op_desc->SetInput("x_max", {max_node->Name()}); + } IR_NODE_LINK_TO(max_node, fusion_op); } } diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index ada4a4b9b6c2f..90ca41f72958e 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -64,9 +64,12 @@ void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out) { case phi::DataType::FLOAT32: phi::TransposeKernel(*cpu_ctx, *in, axis, out_ptr); break; + case phi::DataType::INT8: + phi::TransposeKernel(*cpu_ctx, *in, axis, out_ptr); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( - "Only support fp16 and fp32, but received dtype is %s.", + "Only support fp16/fp32/int8, but received dtype is %s.", phi::DataTypeToString(in->dtype()))); break; } diff --git a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc index 8383501c30b8f..fff3c4020b544 100644 --- a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc @@ -286,6 +286,33 @@ void MapMatmulV2ToMatmulXPUPass::MapMatmulV2ToMatmul(ir::Graph* graph) const { desc.SetAttr("transpose_X", matmul_v2->Op()->GetAttr("trans_x")); desc.SetAttr("transpose_Y", matmul_v2->Op()->GetAttr("trans_y")); desc.SetAttr("alpha", 1.0f); + if (matmul_v2->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2->Op()->GetAttr("enable_int8")); + } + if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_x->Name())) { + desc.SetAttr("Input_scale_" + matmul_x->Name(), + matmul_v2->Op()->GetAttr("Input_scale_" + matmul_x->Name())); + } + if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_y->Name())) { + desc.SetAttr("Input_scale_" + matmul_y->Name(), + matmul_v2->Op()->GetAttr("Input_scale_" + matmul_y->Name())); + } + if (matmul_v2->Op()->HasAttr("Input_scale_" + 
matmul_out->Name())) { + desc.SetAttr( + "Input_scale_" + matmul_out->Name(), + matmul_v2->Op()->GetAttr("Input_scale_" + matmul_out->Name())); + } + if (matmul_v2->Op()->HasAttr("weight_scale")) { + desc.SetAttr("weight_scale", matmul_v2->Op()->GetAttr("weight_scale")); + } + if (matmul_v2->Op()->HasAttr("weight_bit_length")) { + desc.SetAttr("weight_bit_length", + matmul_v2->Op()->GetAttr("weight_bit_length")); + } + if (matmul_v2->Op()->HasAttr("weight_quant_axis")) { + desc.SetAttr("weight_quant_axis", + matmul_v2->Op()->GetAttr("weight_quant_axis")); + } if (matmul_v2->Op()->HasAttr("use_mkldnn")) { desc.SetAttr("use_mkldnn", matmul_v2->Op()->GetAttr("use_mkldnn")); } diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index f2acd0893a6f7..eeb36a86eeec7 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -133,7 +133,7 @@ void FcXPUKernel(const Context& ctx, DenseTensor* out, DenseTensor* out_max) { // Dont use template T param - VLOG(1) << "Kernel type: " << x.dtype() << "," << w.dtype() << " ," + VLOG(1) << "Kernel type: " << x.dtype() << " ," << w.dtype() << " ," << out_dtype; if (x.dtype() == DataType::FLOAT32) { // float32/float16 kernel @@ -155,6 +155,8 @@ void FcXPUKernel(const Context& ctx, FC_XPU_KERNEL_IMPL(float, int8_t, float, int8_t); } else if (out_dtype == DataType::INT8) { FC_XPU_KERNEL_IMPL(float, int8_t, int8_t, int8_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(float, int8_t, dtype::float16, int8_t); } else { PADDLE_THROW(phi::errors::Unimplemented( "Not support x_dtype is %s, w_dtype is %s and out_dtype is " From 3ab34c63594ee6eeaa5c6db1e1d66bc122bdae14 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 9 Oct 2023 16:11:50 +0800 Subject: [PATCH 03/15] support quantize of pass --- paddle/fluid/framework/ir/CMakeLists.txt | 8 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 12 + .../ir/quantize_related_pass_utils.h | 84 +++++ paddle/fluid/framework/ir/xpu/pass_utils.h | 18 + .../ir/xpu/xpu_graph_pattern_detector.cc | 128 +++++++ .../ir/xpu/xpu_graph_pattern_detector.h | 96 ++++++ .../framework/ir/xpu/xpu_quantize_op_pass.cc | 275 +++++++++++++++ .../framework/ir/xpu/xpu_quantize_op_pass.h | 65 ++++ .../ir/xpu/xpu_quantize_squash_pass.cc | 312 ++++++++++++++++++ .../ir/xpu/xpu_quantize_squash_pass.h | 110 ++++++ .../inference/api/paddle_pass_builder.cc | 6 +- paddle/phi/api/yaml/ops.yaml | 20 ++ paddle/phi/backends/xpu/xpu2_op_list.cc | 4 + paddle/phi/infermeta/binary.cc | 20 ++ paddle/phi/infermeta/binary.h | 12 + .../phi/kernels/xpu/dequantization_kernel.cc | 66 ++++ paddle/phi/kernels/xpu/quantization_kernel.cc | 70 ++++ 17 files changed, 1303 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/framework/ir/quantize_related_pass_utils.h create mode 100644 paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc create mode 100644 paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h create mode 100644 paddle/phi/kernels/xpu/dequantization_kernel.cc create mode 100644 paddle/phi/kernels/xpu/quantization_kernel.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt 
index e9a8e4cc22cac..42e9a1267e0ee 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -237,7 +237,11 @@ if(WITH_XPU) xpu_pass_utils SRCS xpu/pass_utils.cc DEPS pass xpu_quant_utils) - set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils) + cc_library( + xpu_graph_pattern_detector + SRCS xpu/xpu_graph_pattern_detector.cc + DEPS graph_pattern_detector) + set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils xpu_graph_pattern_detector) pass_library(cast_mixed_precision_op_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) @@ -247,6 +251,8 @@ if(WITH_XPU) # pass_library(conv1d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_bias_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(xpu_quantize_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(xpu_quantize_squash_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(auto_trans_quantize_op_precision_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 42c7f7acdc103..9245305889907 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -19,6 +19,7 @@ #include #include #include +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" namespace paddle { namespace framework { @@ -94,6 +95,8 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { scope, platform::errors::InvalidArgument( "Scope in DeleteQuantDequantLinearOpPass should not be null.")); + std::unordered_map> var_quant_scales{}; + // Create pattern patterns::DeleteQuantDequantLinearOpPattern pattern(gpd.mutable_pattern(), pattern_name); @@ -146,6 +149,11 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { any_op_desc->SetAttr( "Input_bit_length_" + quantize_linear_op_x->Var()->Name(), bit_length); + if (!var_quant_scales.count(quantize_linear_op_x->Var()->Name())) { + var_quant_scales.insert( + std::make_pair(quantize_linear_op_x->Var()->Name(), + std::vector({input_scale}))); + } // link x to any_op2 any_op_desc->RenameInput(dequantize_linear_op_out->Var()->Name(), quantize_linear_op_x->Var()->Name()); @@ -165,6 +173,10 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(found_count); + + // save var_quant_scales in the temporary save op's attr01 + SaveInfoInTheTmpOp( + graph, "has_quant_info", "var_quant_scales", var_quant_scales); } } // namespace ir diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h new file mode 100644 index 0000000000000..ce97cdd5fee33 --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +static void SaveInfoInTheTmpOp( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map) { + VLOG(3) << "save variables in the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + OpDesc op_desc; + op_desc.SetType("save"); + auto* op_node = graph->CreateOpNode(&op_desc); + + op_node->Op()->SetAttr(flag, true); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + op_node->Op()->SetAttr(iter->first + suffix, iter->second); + } +} + +static void GetInfoFromTheTmpOp( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + std::unordered_map>* info_map) { + VLOG(3) << "get variables from the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() != "save") continue; + VLOG(5) << "Come in save op"; + auto* op_desc = op_node->Op(); + if (op_desc->GetAttrIfExists(flag)) { + VLOG(5) << "flag is true"; + op_desc->RemoveAttr(flag); + std::vector attr_names = op_desc->AttrNames(); + for (auto fake_name : attr_names) { + VLOG(5) << "fake_name:" << fake_name; + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + VLOG(5) << "name:" << name; + auto scales_vector = + PADDLE_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); + VLOG(5) << "scales_vector:" << scales_vector[0]; + info_map->insert(std::make_pair(name, scales_vector)); + VLOG(5) << "insert success:"; + op_desc->RemoveAttr(fake_name); + VLOG(5) << "remove success:"; + } + } + graph->RemoveNode(op_node); + VLOG(5) << "remove op node success:"; + break; + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.h b/paddle/fluid/framework/ir/xpu/pass_utils.h index 556a14fa0e9e4..417ba361e4348 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.h +++ b/paddle/fluid/framework/ir/xpu/pass_utils.h @@ -101,6 +101,24 @@ void PrepareWeight(Graph* graph, void PrepareBias( Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst); +inline std::string FindOutputNameByVarName(framework::OpDesc* op, + const std::string& searched_name) { + std::string ret; + for (const auto& name : op->OutputNames()) + for (const auto& output_name : op->Output(name)) + if (output_name == searched_name) ret = name; + return ret; +} + +inline std::string FindInputNameByVarName(framework::OpDesc* op, + const std::string& searched_name) { + std::string ret; + for (const auto& name : op->InputNames()) + for (const auto& input_name : op->Input(name)) + if (input_name == searched_name) ret = name; + return ret; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc 
b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc new file mode 100644 index 0000000000000..f74f9c8289d65 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +PDNode *patterns::DequantXPUAny::operator()() { + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize_xpu", "y"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksTo({dequant_out}); + next_op->LinksFrom({dequant_out}); + + return dequant_out; +} + +PDNode *patterns::QuantXPUAny::operator()() { + auto *quant_in = pattern->NewNode(quant_in_repr()) + ->AsInput() + ->assert_is_op_input("quantize_xpu", "x"); + auto *quant_op = + pattern->NewNode(quant_op_repr())->assert_is_op("quantize_xpu"); + + auto *quant_out = pattern->NewNode(quant_out_repr()) + ->AsOutput() + ->assert_is_op_output("quantize_xpu", "y"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + quant_op->LinksFrom({quant_in}).LinksTo({quant_out}); + next_op->LinksFrom({quant_out}); + + return quant_out; +} + +PDNode *patterns::DequantQuantXPUAny::operator()() { + auto *dequant_in = pattern->NewNode(dequant_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize_xpu", "x"); + auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize_xpu", "max"); + + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize_xpu", "y"); + + auto *quant_max_in = pattern->NewNode(quant_max_in_repr()) + ->assert_is_op_input("quantize_xpu", "max"); + + auto *quant_op = pattern->NewNode(quant_op_repr()) + ->assert_is_op("quantize_xpu") + ->AsIntermediate(); + + auto *quant_out = pattern->NewNode(quant_out_repr()) + ->AsOutput() + ->assert_is_op_output("quantize_xpu"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); + quant_op->LinksFrom({dequant_out, quant_max_in}).LinksTo({quant_out}); + next_op->LinksFrom({quant_out}); + + return quant_out; +} + +PDNode *patterns::OpDequantXPU::operator()() { + auto any_op = pattern->NewNode(any_op_repr())->assert_is_op(); + auto *dequant_in = pattern->NewNode(dequant_in_repr()) + ->assert_is_op_input("dequantize_xpu", "x"); + + auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize_xpu", "max"); + auto *dequant_op = + 
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); + auto dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize_xpu", "y"); + + any_op->LinksTo({dequant_in}); + dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); + return dequant_out; +} + +PDNode *patterns::MultipleQuantizeXPU::operator()() { + auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput(); + + // find nodes that are inputs to quantize operators + prev_out->assert_more([&](Node *node) { + int counter = static_cast(std::count_if( + node->outputs.begin(), node->outputs.end(), [&](Node const *iter) { + return iter && iter->IsOp() && iter->Op()->Type() == "quantize_xpu"; + })); + return (counter > 1); + }); + + return prev_out; +} + +} // namespace patterns +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h new file mode 100644 index 0000000000000..c849b2a24bb48 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h @@ -0,0 +1,96 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// Dequantize + anyOP +// This quantize is used for getting number of ops the Dequantize's +// output is an input to. +struct DequantXPUAny : public PatternBase { + DequantXPUAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_xpu_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Quantize + anyOP +struct QuantXPUAny : public PatternBase { + QuantXPUAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "quant_xpu_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(quant_in); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Dequantize + Quantize + anyOP +// This pattern is used for squashing the dequantize-quantize pairs. 
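+// A rough sketch of the matched subgraph (names follow the nodes declared
+// below; not an exhaustive description):
+//   dequant_in -> dequantize_xpu -> dequant_out -> quantize_xpu -> quant_out -> next_op
+// where both ops also take a "max" scale input. When the two scales match,
+// xpu_quantize_squash_pass can drop the pair and feed dequant_in to next_op
+// directly.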
+struct DequantQuantXPUAny : public PatternBase { + DequantQuantXPUAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_quant_xpu_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_in); + PATTERN_DECL_NODE(dequant_max_in); + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(quant_max_in); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Op + Dequant +// named nodes: +// any_op, dequant_in +// dequant_op, dequant_out +struct OpDequantXPU : public PatternBase { + OpDequantXPU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "op_dequant_xpu") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(any_op); + PATTERN_DECL_NODE(dequant_in); + PATTERN_DECL_NODE(dequant_max_in); + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); +}; + +// anyOp + more then one quantize op +// This pattern is used for squashing multiple quantize with the same scale. +struct MultipleQuantizeXPU : public PatternBase { + MultipleQuantizeXPU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multiple_quantize_xpu") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(prev_out); +}; + +} // namespace patterns +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc new file mode 100644 index 0000000000000..a8fc7102a8d88 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
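+//
+// Note (sketch of the pass's intent, inferred from the code below): for each
+// quantizable fused op (currently conv2d_xpu) whose input scale is recorded in
+// var_quant_scales_, insert
+//   x (fp32/fp16) -> quantize_xpu -> x_int8 -> conv2d_xpu
+// and, when the output scale is also known,
+//   conv2d_xpu -> out_int8 -> dequantize_xpu -> out (original dtype).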
+ +#include "paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/utils/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +static void UnlinkNodes(ir::Node* a, ir::Node* b) { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); +} + +void XPUQuantizeOpPass::GetQuantInfo(Graph* graph) const { + GetInfoFromTheTmpOp( + graph, + "has_quant_info", + "var_quant_scales", + const_cast>*>( + &var_quant_scales_)); +} + +bool XPUQuantizeOpPass::AreScalesPresentForNodes( + std::initializer_list nodes) const { + bool present = true; + for (auto node : nodes) { + if (var_quant_scales_.count(node->Name()) == 0) { + present = false; + } + } + return present; +} + +float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { + return var_quant_scales_.at(node->Name())[0]; +} + +void XPUQuantizeOpPass::QuantizeInput(Graph* g, + Node* op, + Node* input, + std::string input_arg_name) const { + auto* scope = param_scope(); + auto inputs = op->Op()->InputNames(); + bool name_found = + std::find(inputs.begin(), inputs.end(), input_arg_name) != inputs.end(); + PADDLE_ENFORCE_EQ(name_found, + true, + platform::errors::InvalidArgument( + "Var(%s) isn't the input of the %s operator.", + input_arg_name, + op->Op()->Type())); + + // Create quantize output variable + VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); + auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); + quantize_out_node->Var()->SetDataType( + proto::VarType::Type::VarType_Type_INT8); + // Create quantize max_ptr node + + float scale = GetScaleValueForNode(input); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + std::string input_max_name = input->Name() + "_quantize_max"; + VarDesc input_max_desc(input_max_name); + input_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + input_max_desc.SetShape({static_cast(max_ptr_size)}); + input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* input_max_node = g->CreateVarNode(&input_max_desc); + auto input_max_tensor = + scope->Var(input_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, scale); + memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + + // create a quantize op node + OpDesc q_desc; + q_desc.SetType("quantize_xpu"); + q_desc.SetInput("x", std::vector({input->Name()})); + q_desc.SetInput("max", std::vector({input_max_name})); + q_desc.SetOutput("y", std::vector({quantize_out_node->Name()})); + q_desc.SetAttr("out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + q_desc.SetAttr("scale", static_cast(scale)); + + auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
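+  // Resulting subgraph (sketch): input -> quantize_op -> quantize_out_node -> op,
+  // with input_max_node feeding the "max" input of quantize_op.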
+ // update op's input + op->Op()->SetInput(input_arg_name, + std::vector({quantize_out_node->Name()})); + // link quantize op + UnlinkNodes(input, op); + IR_NODE_LINK_TO(input, quantize_op); + IR_NODE_LINK_TO(input_max_node, quantize_op); + IR_NODE_LINK_TO(quantize_op, quantize_out_node); + IR_NODE_LINK_TO(quantize_out_node, op); +} + +void XPUQuantizeOpPass::DequantizeOutput(Graph* g, + Node* op, + Node* output, + std::string output_arg_name) const { + auto* scope = param_scope(); + auto outputs = op->Op()->OutputNames(); + bool name_found = + std::find(outputs.begin(), outputs.end(), output_arg_name) != + outputs.end(); + PADDLE_ENFORCE_EQ(name_found, + true, + platform::errors::InvalidArgument( + "Var(%s) isn't the output of the %s operator.", + output_arg_name, + op->Op()->Type())); + + // Create dequantize input variable + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + dequantize_in_node->Var()->SetDataType( + proto::VarType::Type::VarType_Type_INT8); + + // Create dequantize max_ptr node + float scale = GetScaleValueForNode(output); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + std::string input_max_name = output->Name() + "_dequantize_max"; + VarDesc input_max_desc(input_max_name); + input_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + input_max_desc.SetShape({static_cast(max_ptr_size)}); + input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* input_max_node = g->CreateVarNode(&input_max_desc); + auto input_max_tensor = + scope->Var(input_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, scale); + memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + + // create a quantize op node + OpDesc deq_desc; + deq_desc.SetType("dequantize_xpu"); + deq_desc.SetInput("x", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetInput("max", std::vector({input_max_name})); + deq_desc.SetOutput("y", std::vector({output->Name()})); + deq_desc.SetAttr("out_dtype", static_cast(output->Var()->GetDataType())); + deq_desc.SetAttr("scale", static_cast(scale)); + + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
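+  // Resulting subgraph (sketch): op -> dequantize_in_node -> dequantize_op -> output,
+  // i.e. `op` now writes the int8 tensor and dequantize_xpu restores the
+  // original output variable and dtype.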
+ // update op's input + op->Op()->SetOutput(output_arg_name, + std::vector({dequantize_in_node->Name()})); + // link dequantize op + UnlinkNodes(op, output); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(input_max_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, output); +} + +void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { + for (auto* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if (op->Type() != "conv2d_xpu") { + continue; + } + Node* w_var_node = nullptr; + Node* x_var_node = nullptr; + Node* out_var_node = nullptr; + Node* branch_var_node = nullptr; + + for (auto* input_node : n->inputs) { + if (!input_node->IsVar()) { + continue; + } + if (input_node->Var()->Name() == op->Input("x")[0]) { + x_var_node = input_node; + } else if (input_node->Var()->Name() == op->Input("filter")[0]) { + w_var_node = input_node; + } else if (op->HasInput("branch") && + input_node->Var()->Name() == op->Input("branch")[0]) { + branch_var_node = input_node; + } + } + + for (auto* output_node : n->outputs) { + if (!output_node->IsVar()) { + continue; + } + if (output_node->Var()->Name() == op->Output("out")[0]) { + out_var_node = output_node; + } + } + if (!AreScalesPresentForNodes({x_var_node})) { + // MarkAndLogCannotQuantizeOp(conv_op, + // "No scale available for the operator"); + return; + } + + QuantizeInput(graph, n, x_var_node, "x"); + // Branch input + if (branch_var_node != nullptr) { + if (AreScalesPresentForNodes({branch_var_node})) { + QuantizeInput(graph, n, branch_var_node, "branch"); + } else { + n->Op()->SetAttr("xpu_op_force_output_precision", + branch_var_node->Var()->GetDataType()); + } + } + + auto has_output_scale = AreScalesPresentForNodes({out_var_node}); + if (has_output_scale) { + DequantizeOutput(graph, n, out_var_node, "out"); + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + } else { + n->Op()->SetAttr("xpu_op_force_output_precision", + x_var_node->Var()->GetDataType()); + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + } + } + } +} + +void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Insert quantize/dequantize op to the graph."; + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(name_scope_, graph); + PADDLE_ENFORCE_NOT_NULL( + param_scope(), + platform::errors::InvalidArgument("Scope cannot be nullptr.")); + + GetQuantInfo(graph); + VLOG(1) << "Get quant info from graph success."; + QuantizeConv(graph); + VLOG(1) << "Quantize conv of the graph success."; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(xpu_quantize_op_pass, paddle::framework::ir::XPUQuantizeOpPass); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h new file mode 100644 index 0000000000000..0b74682009351 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h @@ -0,0 +1,65 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Quantize all supported operators.
+ */
+class XPUQuantizeOpPass : public FusePassBase {
+ public:
+  virtual ~XPUQuantizeOpPass() {}
+
+ protected:
+  void ApplyImpl(Graph* graph) const override;
+  void QuantizeConv(Graph* graph) const;
+
+ private:
+  void QuantizeInput(Graph* g,
+                     Node* op,
+                     Node* input,
+                     std::string input_arg_name) const;
+
+  void DequantizeOutput(Graph* g,
+                        Node* op,
+                        Node* output,
+                        std::string output_arg_name) const;
+
+  void GetQuantInfo(Graph* graph) const;
+
+  bool AreScalesPresentForNodes(std::initializer_list nodes) const;
+
+  float GetScaleValueForNode(Node* node) const;
+
+  std::unordered_map> var_quant_scales_;
+  const std::string name_scope_{"xpu_quantize_op_pass"};
+};
+
+} // namespace ir
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc
new file mode 100644
index 0000000000000..8571dee220d3b
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc
@@ -0,0 +1,312 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
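+//
+// Note (informal summary, based on the handlers below): this pass cleans up
+// the graph produced by xpu_quantize_op_pass, e.g. a
+//   conv2d_xpu -> dequantize_xpu -> quantize_xpu -> conv2d_xpu
+// chain collapses to
+//   conv2d_xpu -> conv2d_xpu
+// when both sides use the same scale, and duplicate quantize_xpu ops fed by
+// the same variable with the same scale are merged.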
+ +#include "paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h" + +#include +#include + +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/utils/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +XPUQuantizeSquashPass::XPUQuantizeSquashPass() {} + +void XPUQuantizeSquashPass::FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + patterns::DequantXPUAny deq_any_pattern{gpd.mutable_pattern(), + "dequant_xpu_any"}; + deq_any_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern); + + if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end()) + (*nodes_keep_counter)[dequant_out] = 1; + else + (*nodes_keep_counter)[dequant_out] += 1; + + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +void XPUQuantizeSquashPass::DequantQuantSquash( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + LOG(INFO) << "DequantQuantSquash COME IN"; + patterns::DequantQuantXPUAny squash_pattern{gpd.mutable_pattern(), + "dequant_quant_xpu_any"}; + squash_pattern(); + + int found_dequant_quant_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + LOG(INFO) << "squash dequantize-quantize ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern); + + auto* next_op_desc = next_op->Op(); + float dequant_scale = + PADDLE_GET_CONST(float, dequant_op->Op()->GetAttr("scale")); + float quant_scale = + PADDLE_GET_CONST(float, quant_op->Op()->GetAttr("scale")); + + PADDLE_ENFORCE_NE( + nodes_keep_counter->find(dequant_out), + nodes_keep_counter->end(), + platform::errors::NotFound("The dequant output node is not found.")); + + // check if dequantize op should be kept or removed, decrease the counter + bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; + + int equal = dequant_scale == quant_scale ? 
1 : 0; + if (dequant_scale == quant_scale || isnan(dequant_scale) || + isnan(quant_scale) || isinf(dequant_scale) || isinf(quant_scale)) { + // squash dequantize-quantize to nothing + + auto quant_out_var_name = quant_out->Name(); + for (auto input_name : next_op_desc->InputNames()) { + auto& input_names = next_op_desc->MutableInputs()->at(input_name); + std::replace(input_names.begin(), + input_names.end(), + quant_out_var_name, + dequant_in->Name()); + next_op_desc->SetInput(input_name, input_names); + } + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op, quant_out}); + else + GraphSafeRemoveNodes(graph, + {dequant_op, quant_op, dequant_out, quant_out}); + + IR_NODE_LINK_TO(dequant_in, next_op); + + found_dequant_quant_count++; + } else { + // squash dequantize-quantize to requantize op + // OpDesc desc; + // desc.SetType("requantize"); + // desc.SetInput("Input", + // std::vector({dequant_in->Name()})); + // desc.SetOutput("Output", + // std::vector({quant_out->Name()})); + // desc.SetAttr("Scale_in", dequant_scale); + // desc.SetAttr("Shift_in", dequant_shift); + // desc.SetAttr("Scale_out", quant_scale); + // desc.SetAttr("Shift_out", quant_shift); + + // auto requant_op = g->CreateOpNode(&desc); + + // if (keep_dequant) + // GraphSafeRemoveNodes(graph, {quant_op}); + // else + // GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out}); + + // IR_NODE_LINK_TO(dequant_in, requant_op); + // IR_NODE_LINK_TO(requant_op, quant_out); + + // found_dequant_quant_count++; + } + }; + gpd(graph, handler); + AddStatis(found_dequant_quant_count); + PrettyLogDetail("--- squashed %d dequantize-quantize pairs", + found_dequant_quant_count); +} + +void XPUQuantizeSquashPass::OpDequantSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::OpDequantXPU op_dequant_pattern{gpd.mutable_pattern(), + "op_dequant_xpu"}; + op_dequant_pattern(); + + int found_op_dequant_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash op-dequant ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(any_op, any_op, op_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, op_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, op_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, op_dequant_pattern); + + if (dequant_in->outputs.size() == 1) { + // Find the name of the output linking any_op to dequant_in + std::string output_name = + FindOutputNameByVarName(any_op->Op(), dequant_in->Name()); + + if (output_name.empty()) return; + any_op->Op()->SetAttr("out_dtype", dequant_out->Var()->GetDataType()); + any_op->Op()->SetOutput(output_name, + std::vector({dequant_out->Name()})); + IR_NODE_LINK_TO(any_op, dequant_out); + GraphSafeRemoveNodes(graph, {dequant_in, dequant_op}); + found_op_dequant_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_op_dequant_squash_count); + PrettyLogDetail("--- squashed %d dequant with ops", + found_op_dequant_squash_count); +} + +// conv2d_xpu, fc_xpu +void XPUQuantizeSquashPass::QuantOpSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::QuantXPUAny quant_any_pattern{gpd.mutable_pattern(), + "quant_xpu_any"}; + quant_any_pattern(); + + int found_quant_op_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash op-dequant ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(quant_in, quant_in, quant_any_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, 
quant_any_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, quant_any_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, quant_any_pattern); + + if (quant_out->outputs.size() == 1) { + std::string input_name = + FindInputNameByVarName(next_op->Op(), quant_out->Name()); + + if (input_name.empty()) return; + // Only support quant + conv2d_xpu/fc_xpu fusion + if (!(next_op->Op()->Type() == "conv2d_xpu" || + next_op->Op()->Type() == "fc_xpu")) { + return; + } + next_op->Op()->SetInput(input_name, + std::vector({quant_in->Name()})); + IR_NODE_LINK_TO(quant_in, next_op); + GraphSafeRemoveNodes(graph, {quant_out, quant_op}); + found_quant_op_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_quant_op_squash_count); + PrettyLogDetail("--- squashed %d quantize with ops", + found_quant_op_squash_count); +} + +void XPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::MultipleQuantizeXPU multiple_quantize_pattern{ + gpd.mutable_pattern(), "multiple_quantize_xpu"}; + multiple_quantize_pattern(); + + int found_multiple_quantize_squash_count = 0; + int removed_quantize = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "fuse multiple quantize ops"; + + GET_IR_NODE_FROM_SUBGRAPH(prev_out, prev_out, multiple_quantize_pattern); + + auto* first_quant_op = *(std::find_if( + prev_out->outputs.begin(), prev_out->outputs.end(), [&](Node* node) { + return (node->IsOp() && node->Op()->Type() == "quantize_xpu"); + })); + auto* first_quant_out = first_quant_op->outputs[0]; + float scale = first_quant_op->Op()->GetAttrIfExists("scale"); + + PADDLE_ENFORCE_NE(scale, + 0, + platform::errors::InvalidArgument( + "Quantize scale(%f) should not be equal 0.", scale)); + + for (int iter = prev_out->outputs.size() - 1; iter >= 0; iter--) { + auto quant_op = prev_out->outputs[iter]; + if (quant_op->IsOp() && quant_op->Op()->Type() == "quantize_xpu" && + quant_op->id() != first_quant_op->id() && + quant_op->Op()->GetAttrIfExists("scale") == scale) { + auto quant_out = quant_op->outputs[0]; + auto last_op = quant_out->outputs[0]; + auto last_op_op = last_op->Op(); + + std::string last_op_input_name = + FindInputNameByVarName(last_op_op, quant_out->Name()); + + PADDLE_ENFORCE_NE( + last_op_input_name.empty(), + true, + platform::errors::NotFound("Operator after quantize operator(%s) " + "should have quantize output as input.", + quant_out->Name())); + + // update the next operator input, + // by replacing quant_out with first_quant_out + auto last_op_names = last_op->Op()->Inputs().at(last_op_input_name); + std::replace(last_op_names.begin(), + last_op_names.end(), + quant_out->Name(), + first_quant_out->Name()); + last_op_op->SetInput(last_op_input_name, + std::vector(last_op_names)); + + IR_NODE_LINK_TO(first_quant_out, last_op); + GraphSafeRemoveNodes(graph, {quant_op, quant_out}); + removed_quantize++; + } + } + found_multiple_quantize_squash_count++; + }; + gpd(graph, handler); + AddStatis(found_multiple_quantize_squash_count); + PrettyLogDetail("--- squashed %d quantize op", removed_quantize); +} + +void XPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, + platform::errors::InvalidArgument( + "The graph in function XPUQuantizeSquashPass::ApplyImpl is null.")); + FusePassBase::Init("xpu_quantize_squash_pass", graph); + + std::unordered_map nodes_keep_counter; + FindNodesToKeep(graph, &nodes_keep_counter); + DequantQuantSquash(graph, 
&nodes_keep_counter); + OpDequantSquash(graph); + // QuantOpSquash(graph); + MultipleQuantizeSquash(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(xpu_quantize_squash_pass, + paddle::framework::ir::XPUQuantizeSquashPass); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h new file mode 100644 index 0000000000000..fbfa967791304 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h @@ -0,0 +1,110 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Squash dequantize->quantize pair pattern into requantize op + */ + +class XPUQuantizeSquashPass : public FusePassBase { + public: + XPUQuantizeSquashPass(); + virtual ~XPUQuantizeSquashPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + /* + * For each dequantize's output find the number of operators it is an input to + */ + void FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Don't squash unsigned dequantize with signed quantize. + * This is important for concat and elementwise ops. + * When inputs have different sign, concat will assume signed type and + * elementwise assumes first input type. 
+ */ + bool IsDequantizeQuantizeIncompatible(Node* quant_op, + Node* dequant_op, + Node* next_op) const; + + /* + * Squash dequantize-quantize ops pairs into requantize or nothing + */ + void DequantQuantSquash( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Squash requantize op into conv with scale_out like requantize scale_out + */ + void OpRequantSquash(Graph* graph) const; + + /* + * Squash requantize op if the next operator's input scale can be updated + */ + void RequantOpSquash(Graph* graph) const; + + /* + * Squash dequant if the previous operator has force_fp32_output attribute + */ + void OpDequantSquash(Graph* graph) const; + + /* + * Squash quantize if several quatize ops have the same scale + */ + void MultipleQuantizeSquash(Graph* graph) const; + + /* + * Squash scale if dequantize is before scale + */ + void DequantScaleSquash(Graph* graph) const; + + /* + * Squash scale if scale is before quantize + */ + void ScaleQuantSquash(Graph* graph) const; + + /* + * Squash quantize if is before bfloat16 conv2d or fused_conv2d + */ + void QuantizeBf16Conv(Graph* graph) const; + + void QuantizeBf16ConvImpl(Graph* graph, const std::string& conv_type) const; + + /* + * Squash quantize if is before conv2d_xpu/fc_xpuy + */ + void QuantOpSquash(Graph* graph) const; + + const std::string name_scope_{"squash"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 65fd8a74aa101..41d2ccd67b43a 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -561,10 +561,12 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "fast_where_xpu_fuse_pass", "elementwise_mul_add_fuse_pass", "link_xpu_op_max_pass", - "delete_isolated_node_pass", // "auto_mixed_precision_pass", "cast_mixed_precision_op_fuse_pass", - "auto_trans_quantize_op_precision_pass", + "xpu_quantize_op_pass", + "xpu_quantize_squash_pass", + // "auto_trans_quantize_op_precision_pass", + "delete_isolated_node_pass", "inplace_op_var_pass", }); use_xpu_ = true; diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 4e67144ba8a89..14188449b2def 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -657,6 +657,16 @@ func : depthwise_conv2d backward : depthwise_conv2d_grad +- op : dequantize_xpu + args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : DeQuantizeXPUInferMeta + kernel : + func : dequantize_xpu + data_type: x + optional : max + - op : det args : (Tensor x) output : Tensor @@ -2017,6 +2027,16 @@ func : qr backward : qr_grad +- op : quantize_xpu + args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : QuantizeXPUInferMeta + kernel : + func : quantize_xpu + data_type : x + optional : max + - op : real args : (Tensor x) output : Tensor (out) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index f8139af52cb22..a4109197521a5 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -212,6 +212,8 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT32})}, {"depthwise_conv2d_transpose", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"dequantize_xpu", + XPUKernelSet({phi::DataType::INT16, phi::DataType::INT8})}, 
{"diag_v2", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, @@ -615,6 +617,8 @@ XPUOpMap& get_kl2_ops() { {"prelu_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"prod_raw", XPUKernelSet({phi::DataType::FLOAT32})}, + {"quantize_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"range", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT64, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2fd87760378fc..5b31803127eb3 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -978,6 +978,16 @@ void DepthwiseConvInferMeta(const MetaTensor& input, config); } +void DeQuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -2597,6 +2607,16 @@ void PriorBoxInferMeta(const MetaTensor& input, var->set_dims(phi::make_ddim(dim_vec)); } +void QuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x, const MetaTensor& repeats, int dim, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 94d8bb606ea5d..aa469554b6fd4 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -155,6 +155,12 @@ void DepthwiseConvInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void DeQuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y); + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -408,6 +414,12 @@ void PriorBoxInferMeta(const MetaTensor& input, MetaTensor* out, MetaTensor* var); +void QuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y); + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/kernels/xpu/dequantization_kernel.cc b/paddle/phi/kernels/xpu/dequantization_kernel.cc new file mode 100644 index 0000000000000..20423c1eb8920 --- /dev/null +++ b/paddle/phi/kernels/xpu/dequantization_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void DeQuantizeKernelImpl(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DenseTensor* y) { + using XPUInX = typename XPUTypeTrait::Type; + using XPUOutY = typename XPUTypeTrait::Type; + + auto* y_data = ctx.template Alloc(y); + const auto* x_data = x.data(); + int64_t len = x.numel(); + const float* max_data = + max.get_ptr() == nullptr ? nullptr : max->data(); + int r = xpu::dequantization( + ctx.x_context(), + reinterpret_cast(x_data), + reinterpret_cast(y_data), + len, + max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "dequantization"); +} + +template +void DeQuantizeKernel(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DataType out_dtype, + float scale, + DenseTensor* y) { + switch (out_dtype) { + case DataType::FLOAT32: + DeQuantizeKernelImpl(ctx, x, max, y); + break; + case DataType::FLOAT16: + DeQuantizeKernelImpl(ctx, x, max, y); + break; + default: + PADDLE_THROW(phi::errors::Unavailable( + "Not supported Quantize data type from %d -> %d ", + x.dtype(), + out_dtype)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + dequantize_xpu, XPU, ALL_LAYOUT, phi::DeQuantizeKernel, int16_t, int8_t) {} diff --git a/paddle/phi/kernels/xpu/quantization_kernel.cc b/paddle/phi/kernels/xpu/quantization_kernel.cc new file mode 100644 index 0000000000000..01f6ddad93aa0 --- /dev/null +++ b/paddle/phi/kernels/xpu/quantization_kernel.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void QuantizeKernelImpl(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DenseTensor* y) { + using XPUInX = typename XPUTypeTrait::Type; + using XPUOutY = typename XPUTypeTrait::Type; + + auto* y_data = ctx.template Alloc(y); + const auto* x_data = x.data(); + int64_t len = x.numel(); + const float* max_data = + max.get_ptr() == nullptr ? 
nullptr : max->data(); + int r = xpu::quantization( + ctx.x_context(), + reinterpret_cast(x_data), + reinterpret_cast(y_data), + len, + max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); +} + +template +void QuantizeKernel(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DataType out_dtype, + float scale, + DenseTensor* y) { + switch (out_dtype) { + case DataType::INT16: + QuantizeKernelImpl(ctx, x, max, y); + break; + case DataType::INT8: + QuantizeKernelImpl(ctx, x, max, y); + break; + default: + PADDLE_THROW(phi::errors::Unavailable( + "Not supported quantize data type from %d -> %d ", + x.dtype(), + out_dtype)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(quantize_xpu, + XPU, + ALL_LAYOUT, + phi::QuantizeKernel, + float, + phi::dtype::float16) {} From da5cb07f2e95c451378043989db64b570a990030 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Thu, 19 Oct 2023 14:26:50 +0800 Subject: [PATCH 04/15] support fp16 fix --- .../framework/ir/auto_mixed_precision_pass.cc | 11 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 10 +- .../delete_weight_dequant_linear_op_pass.cc | 28 +-- paddle/fluid/framework/ir/graph.h | 21 ++ .../ir/quantize_related_pass_utils.h | 103 +++++++++- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 184 ++++++++---------- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 105 +++++----- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 14 +- .../ir/xpu/reshape2_matmul_xpu_fuse_pass.cc | 29 +-- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 112 ++++++++--- .../framework/ir/xpu/xpu_quantize_op_pass.h | 7 +- .../passes/convert_to_mixed_precision.cc | 4 - 12 files changed, 379 insertions(+), 249 deletions(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 497dcae8395d5..fe5ec348bf707 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -524,7 +524,6 @@ void AutoMixedPrecisionPass::UpdateOpPrecision() const { vars_should_not_low_precision.insert(in_var_node->Var()->Name()); } } - // when op_1 only support cpu kernel. if op_2's intput var is op_1's // output var, then op_2 should not run at low precision. 
if (GetOpOriginalType(op_type) != "feed" && @@ -688,6 +687,16 @@ bool AutoMixedPrecisionPass::InputVarsNotConvert( if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { return true; } + } else if (GetOpOriginalType(op_desc->Type()) == "quantize_linear" || + GetOpOriginalType(op_desc->Type()) == "dequantize_linear") { + auto vecs = op_desc->Input("Scale"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("ZeroPoint"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } } } diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 9245305889907..0a9fc07a7cb07 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -91,6 +91,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { GraphPatternDetector gpd; auto* scope = param_scope(); + BlockDesc* block = nullptr; PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument( @@ -113,6 +114,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { return; } */ + block = quantize_linear_op->Op()->Block(); std::unordered_set nodes2rm = {}; // Get input scale from tensor @@ -140,15 +142,10 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { int nums_any_ops = static_cast(dequantize_linear_op_out->outputs.size()); - int bit_length = - PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length")); for (int i = 0; i < nums_any_ops; ++i) { auto* any_op_desc = dequantize_linear_op_out->outputs[i]->Op(); any_op_desc->SetAttr("Input_scale_" + quantize_linear_op_x->Var()->Name(), input_scale); - any_op_desc->SetAttr( - "Input_bit_length_" + quantize_linear_op_x->Var()->Name(), - bit_length); if (!var_quant_scales.count(quantize_linear_op_x->Var()->Name())) { var_quant_scales.insert( std::make_pair(quantize_linear_op_x->Var()->Name(), @@ -174,8 +171,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { gpd(graph, handler); AddStatis(found_count); - // save var_quant_scales in the temporary save op's attr01 - SaveInfoInTheTmpOp( + SaveQuantInfoInTheGraph( graph, "has_quant_info", "var_quant_scales", var_quant_scales); } diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 0140fb664b1de..968120068b92a 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -35,7 +35,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { true, platform::errors::InvalidArgument( "Graph must have kParamScopeAttr attribute.")); - VLOG(1) << "Handle delete weight dequant linear op pass ..."; + VLOG(3) << "Handle delete weight dequant linear op pass ..."; auto& scope = graph->Get(kParamScopeAttr); bool is_int8 = false; @@ -44,10 +44,10 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - VLOG(1) << "Dequantize linear op Type: " << op->Type(); if (op->Type() == "dequantize_linear") { - VLOG(1) << "Dequantize linear op is come in: " << op->Type(); - Node *weight_var_node, *calcu_op_node, *while_op_node; + Node* weight_var_node = nullptr; + Node* calcu_op_node = 
nullptr; + Node* while_op_node = nullptr; Node *dequantized_weight_var_node = nullptr, *scale_var_node = nullptr; // 1. Judge whether for dequant weight and find // weight_var_node/scale_var_node @@ -60,9 +60,12 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { scale_var_node = input_node; } } else { - return; + break; } } + if (weight_var_node == nullptr || scale_var_node == nullptr) { + continue; + } // 2. Find next_op_node // For while op: delete its input which is related to dequantized // For calculation op: set weight scale as their attributes @@ -107,7 +110,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { } } else { PADDLE_THROW(platform::errors::Unimplemented( - "The dtype of quantization scale must be FP32/16, " + "The dtype of quantization scale must be FP32/FP16, " "but received %d, which is not supported.", weight_scale_tensor->dtype())); } @@ -147,13 +150,12 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_nums)); calcu_op_desc->SetAttr("weight_scale", weight_scale); } - calcu_op_desc->SetAttr("weight_quant_axis", quant_axis); - calcu_op_desc->SetAttr("weight_bit_length", bit_length); - calcu_op_desc->SetAttr("enable_int8", true); - VLOG(1) << "dequantized_weight_var_node->Var()->Name():" - << dequantized_weight_var_node->Var()->Name(); - VLOG(1) << "weight_var_node->Var()->Name(): " - << weight_var_node->Var()->Name(); + if (bit_length == 8) { + // Current 8-bit quantization only supports int8 + calcu_op_desc->SetAttr("op_weights_precision", + std::string("int8")); + } + calcu_op_desc->RenameInput( dequantized_weight_var_node->Var()->Name(), weight_var_node->Var()->Name()); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 3596f4e0f0e29..e29e5a2a9a9d2 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -177,7 +177,11 @@ class Graph { platform::errors::AlreadyExists( "The attribute %s to be set already exists in the graph.", attr_name)); + VLOG(1) << "set attribute " << attr_name; attrs_[attr_name] = attr; + VLOG(1) << "attrs_ size " << attrs_.size(); + std::vector attr_names = AttrNames(); + VLOG(1) << "attr_names size " << attr_names.size(); attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; delete attr; @@ -412,6 +416,23 @@ class Graph { return sub_graphs_.size(); } + std::vector AttrNames() const { + VLOG(1) << "graph addr:" << this; + if (FLAGS_convert_all_blocks) { + if (IsMainGraph()) { + return GetSubGraph(0)->AttrNames(); + } + } + std::vector res; + res.reserve(attrs_.size()); + VLOG(1) << "AttrNames attr size: " << attrs_.size(); + for (auto &attr : attrs_) { + res.push_back(attr.first); + VLOG(1) << "AttrNames: " << attr.first; + } + return res; + } + private: // TODO(levi): delete this interface after when we can convert all // blocks into sub_graphs. 
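
For the next hunk: quantize_related_pass_utils.h now stores the collected scales directly as graph attributes (one std::vector<float> attribute per variable, named "<var>_var_quant_scales_has_quant_info", plus a "has_quant_info" flag), which is also why Graph::AttrNames() is added above. A minimal usage sketch, assuming the helpers introduced in this patch and a graph / filter_node supplied by the calling pass:

#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h"

namespace paddle {
namespace framework {
namespace ir {

// Illustrative only: producer/consumer round trip of the per-variable scales.
void QuantScaleRoundTripSketch(ir::Graph* graph, Node* filter_node) {
  std::unordered_map<std::string, std::vector<float>> scales;
  scales[filter_node->Name()] = {0.0123f};  // hypothetical per-tensor scale

  // Producer side (delete_*_linear passes): publish scales on the graph.
  SaveQuantInfoInTheGraph(graph, "has_quant_info", "var_quant_scales", scales);

  // Consumer side (conv2d_xpu/fc_xpu fuse passes, xpu_quantize_op_pass):
  auto restored =
      GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales");
  if (AreScalesPresentForNodes(&restored, {filter_node})) {
    float scale = GetScaleValueForNode(&restored, filter_node);
    (void)scale;  // e.g. used to fill an int8 *_max tensor
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
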
diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h index ce97cdd5fee33..d6c54cef47d90 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -17,12 +17,13 @@ #include #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -static void SaveInfoInTheTmpOp( +static inline void SaveInfoInTheTmpOp( ir::Graph* graph, const std::string& flag, const std::string& key_suffix, @@ -40,6 +41,20 @@ static void SaveInfoInTheTmpOp( } } +static inline void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map) { + VLOG(1) << "Save quant info in the graph!"; + const std::string suffix = "_" + key_suffix + "_" + flag; + graph->Set(flag, new bool(true)); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + VLOG(1) << "SaveQuantInfoInTheGraph set attr: " << iter->first + suffix; + graph->Set(iter->first + suffix, new std::vector(iter->second)); + } +} + static void GetInfoFromTheTmpOp( ir::Graph* graph, const std::string& flag, @@ -51,34 +66,104 @@ static void GetInfoFromTheTmpOp( for (auto* op_node : ir::TopologyVarientSort(*graph, static_cast(0))) { if (!op_node->IsOp() || op_node->Op()->Type() != "save") continue; - VLOG(5) << "Come in save op"; + VLOG(1) << "Come in save op"; auto* op_desc = op_node->Op(); if (op_desc->GetAttrIfExists(flag)) { - VLOG(5) << "flag is true"; + VLOG(1) << "flag is true"; op_desc->RemoveAttr(flag); std::vector attr_names = op_desc->AttrNames(); + VLOG(1) << "attr_names size:" << attr_names.size(); for (auto fake_name : attr_names) { - VLOG(5) << "fake_name:" << fake_name; + VLOG(1) << "fake_name:" << fake_name; size_t pos = fake_name.find(suffix); if (pos != std::string::npos) { std::string name = fake_name.substr(0, pos); - VLOG(5) << "name:" << name; + VLOG(1) << "name:" << name; auto scales_vector = PADDLE_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); - VLOG(5) << "scales_vector:" << scales_vector[0]; + VLOG(1) << "scales_vector:" << scales_vector[0]; info_map->insert(std::make_pair(name, scales_vector)); - VLOG(5) << "insert success:"; + VLOG(1) << "insert success:"; op_desc->RemoveAttr(fake_name); - VLOG(5) << "remove success:"; + VLOG(1) << "remove success:"; } } graph->RemoveNode(op_node); - VLOG(5) << "remove op node success:"; + VLOG(1) << "remove op node success:"; break; } } } +static inline void GetQuantInfoFromTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + std::unordered_map>* info_map) { + VLOG(1) << "Get quant info from the graph attrs!"; + const std::string suffix = "_" + key_suffix + "_" + flag; + VLOG(1) << "flag:" << (graph->Has(flag) ? 
1 : 0); + if (graph->Has(flag)) { + std::vector attr_names = graph->AttrNames(); + VLOG(1) << "attr_names size:" << attr_names.size(); + for (auto fake_name : attr_names) { + VLOG(1) << "fake_name:" << fake_name; + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + VLOG(1) << "name:" << name; + auto scales_vector = graph->Get>(fake_name); + VLOG(1) << "scales_vector:" << scales_vector[0]; + info_map->insert(std::make_pair(name, scales_vector)); + } + } + } +} + +static inline std::unordered_map> +GetQuantInfoFromTheGraph(ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix) { + std::unordered_map> info_map; + VLOG(1) << "Get quant info from the graph attrs!"; + const std::string suffix = "_" + key_suffix + "_" + flag; + VLOG(1) << "flag:" << (graph->Has(flag) ? 1 : 0); + if (graph->Has(flag)) { + std::vector attr_names = graph->AttrNames(); + VLOG(1) << "attr_names size:" << attr_names.size(); + for (auto fake_name : attr_names) { + VLOG(1) << "fake_name:" << fake_name; + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + VLOG(1) << "name:" << name; + auto scales_vector = graph->Get>(fake_name); + VLOG(1) << "scales_vector:" << scales_vector[0]; + info_map.insert(std::make_pair(name, scales_vector)); + } + } + } + return info_map; +} + +static inline bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes) { + bool present = true; + for (auto node : nodes) { + if (var_quant_scales->count(node->Name()) == 0) { + present = false; + } + } + return present; +} + +static inline float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name())[0]; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 19e006d535409..0605f2355ce2b 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -370,7 +371,7 @@ class Conv2dXPUFusePass : public FusePassBase { bool with_conv_bias, bool with_bn, bool with_scale, - bool enable_int8) const; + std::string op_weights_precision) const; void CreateFusionInputs( ir::Graph* graph, @@ -378,7 +379,9 @@ class Conv2dXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionBranch( ir::Graph* graph, @@ -386,7 +389,9 @@ class Conv2dXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionOutputs( ir::Graph* graph, @@ -394,8 +399,10 @@ class Conv2dXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, 
std::map* fusion_nodes_map, - bool enable_int8, - std::string act_type) const; + std::string op_weights_precision, + std::string act_type, + std::unordered_map>* var_quant_scales) + const; const std::unordered_set support_quant_op_type_{"conv2d", "conv2d_xpu"}; @@ -475,7 +482,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( bool with_conv_bias, bool with_bn, bool with_scale, - bool enable_int8) const { + std::string op_weights_precision) const { // Get Node auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( @@ -589,7 +596,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights - if (!enable_int8) { + if (op_weights_precision != "int8") { float* filter_ptr = filter_t->mutable_data(paddle::platform::CPUPlace()); for (int i = 0; i < mean_len; ++i) { @@ -631,7 +638,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // deal with scale op - if (with_scale && !enable_int8) { + if (with_scale) { auto* scale = GetNodeFromNodesMap(nodes_map, "scale", "scale"); PADDLE_ENFORCE_EQ( scale != nullptr, @@ -657,10 +664,16 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } } // recompute weight as scale op - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); - for (int i = 0; i < filter_len; ++i) { - filter_ptr[i] *= scale_val_; + if (op_weights_precision != "int8") { + float* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < filter_len; ++i) { + filter_ptr[i] *= scale_val_; + } + } else { + for (int i = 0; i < weight_scale.size(); i++) { + weight_scale[i] *= scale_val_; + } } } @@ -669,7 +682,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( Node* filter_intx = nullptr; Node* filter_max = nullptr; Node* scale_max = nullptr; - if (!enable_int8) { + if (op_weights_precision != "int8") { PrepareWeight(graph, scope, block, @@ -735,7 +748,9 @@ void Conv2dXPUFusePass::CreateFusionInputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( @@ -750,9 +765,13 @@ void Conv2dXPUFusePass::CreateFusionInputs( // input max std::string conv_input_max_name = input->Name() + "_input_max"; Node* conv2d_xpu_input_max = nullptr; - if (enable_int8) { - float input_scale = - conv->Op()->GetAttrIfExists("Input_scale_" + input->Name()); + if (op_weights_precision == "int8") { + PADDLE_ENFORCE_EQ(AreScalesPresentForNodes(var_quant_scales, {input}), + true, + platform::errors::InvalidArgument( + "When conv op is running in int8 precision, the " + "scales of input var should be present in!")); + float input_scale = GetScaleValueForNode(var_quant_scales, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc conv_input_max_desc(conv_input_max_name); conv_input_max_desc.SetPersistable( @@ -782,7 +801,9 @@ void Conv2dXPUFusePass::CreateFusionBranch( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* ew_branch_add = GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); @@ -798,7 +819,7 @@ void Conv2dXPUFusePass::CreateFusionBranch( std::string ew_branch_add_max_name = ew_branch_add_in->Name() + 
"branch_max"; Node* ew_branch_add_max = FindNodeWithName(graph, ew_branch_add_max_name); - if (enable_int8 && !ew_branch_add_max) { + if (op_weights_precision == "int8" && !ew_branch_add_max) { int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc ew_branch_add_in_max_desc(ew_branch_add_max_name); ew_branch_add_in_max_desc.SetPersistable( @@ -808,15 +829,19 @@ void Conv2dXPUFusePass::CreateFusionBranch( ew_branch_add_in_max_desc.SetDataType( proto::VarType::Type::VarType_Type_FP32); ew_branch_add_max = graph->CreateVarNode(&ew_branch_add_in_max_desc); - float ew_branch_add_scale = ew_branch_add->Op()->GetAttrIfExists( - "Input_scale_" + ew_branch_add_in->Name()); + PADDLE_ENFORCE_EQ( + AreScalesPresentForNodes(var_quant_scales, {ew_branch_add_in}), + true, + platform::errors::InvalidArgument( + "When conv op is running in int8 precision with branch add, the " + "scales of branch var should be present in!")); + float ew_branch_add_scale = + GetScaleValueForNode(var_quant_scales, ew_branch_add_in); auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( conv != nullptr, true, platform::errors::InvalidArgument("conv node ptr can not be null")); - conv->Op()->SetAttr("Input_scale_" + ew_branch_add_in->Name(), - ew_branch_add_scale); auto ew_branch_add_max_tensor = scope->Var(ew_branch_add_max_name)->GetMutable(); ew_branch_add_max_tensor->set_type(phi::DataType::FLOAT32); @@ -839,8 +864,10 @@ void Conv2dXPUFusePass::CreateFusionOutputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8, - std::string act_type) const { + std::string op_weights_precision, + std::string act_type, + std::unordered_map>* var_quant_scales) + const { auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( conv != nullptr, @@ -921,7 +948,8 @@ void Conv2dXPUFusePass::CreateFusionOutputs( (*fusion_nodes_map)["out"] = conv2d_out_var_node; // Create out max in - if (enable_int8) { + if (op_weights_precision == "int8" && + AreScalesPresentForNodes(var_quant_scales, {conv2d_out_var_node})) { std::string conv_out_max_in_name = conv2d_xpu_out_name + "_max_in"; int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc conv_out_max_in_desc(conv_out_max_in_name); @@ -934,25 +962,8 @@ void Conv2dXPUFusePass::CreateFusionOutputs( block_out_max_in_desc->SetShape(conv_out_max_in_desc.GetShape()); block_out_max_in_desc->SetDataType(conv_out_max_in_desc.GetDataType()); - auto GetOutputScale = [&](Node* var_node, std::string name) -> float { - int nums_any_ops = var_node->outputs.size(); - for (size_t i = 0; i < nums_any_ops; ++i) { - auto* any_op_desc = conv2d_out_var_node->outputs[i]->Op(); - VLOG(1) << "any_op_desc: " << any_op_desc->Type(); - if (any_op_desc->HasAttr("Input_scale_" + name)) { - VLOG(1) << "find it: " - << "Input_scale_" + name; - return any_op_desc->GetAttrIfExists("Input_scale_" + name); - } - } - return 0; - }; float output_scale = - GetOutputScale(conv2d_out_var_node, conv2d_xpu_out_name); - conv->Op()->SetAttr("Input_scale_" + conv2d_xpu_out_name, output_scale); - VLOG(1) << "conv2d_xpu_out_name:" << conv2d_xpu_out_name - << " output_scale: " << output_scale - << "conv2d_out_var_node name:" << conv2d_out_var_node->Name(); + GetScaleValueForNode(var_quant_scales, conv2d_out_var_node); phi::DenseTensor out_max_in_cpu_tensor; auto* cpu_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); @@ -995,7 +1006,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, 
auto* scope = param_scope(); PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - + std::unordered_map> var_quant_scales = + GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -1066,9 +1078,12 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, {"out", nullptr}, {"out_max", nullptr}}; - bool enable_int8 = conv->Op()->GetAttrIfExists("enable_int8"); - std::string op_precision_str = enable_int8 ? "int8" : "fp32"; - VLOG(4) << "Conv2d fusion fuse pass is running on " << op_precision_str + std::string op_weights_precision = "float32"; + if (conv->Op()->HasAttr("op_weights_precision")) { + op_weights_precision = + conv->Op()->GetAttrIfExists("op_weights_precision"); + } + VLOG(4) << "Conv2d fusion fuse pass is running on " << op_weights_precision << " precision!"; auto* block = conv->Op()->Block(); CreateFusionWeightsAndBias(graph, @@ -1079,45 +1094,31 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, with_conv_bias, with_bn, with_scale, - enable_int8); - VLOG(1) << "CreateFusionWeightsAndBias success!"; - CreateFusionInputs( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - VLOG(1) << "CreateFusionInputs success!"; - CreateFusionBranch( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - VLOG(1) << "CreateFusionBranch success!"; + op_weights_precision); + CreateFusionInputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); + CreateFusionBranch(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); CreateFusionOutputs(graph, scope, block, nodes_map, &fusion_nodes_map, - enable_int8, - act_type); - VLOG(1) << "CreateFusionOutputs success!"; - // int out_dtype = PADDLE_GET_CONST(int, conv->Op()->GetAttr("out_dtype")); - // if (out_dtype == proto::VarType::Type::VarType_Type_INT8) { - // fusion_nodes_map["out"]->Var()->SetDataType( - // proto::VarType::Type::VarType_Type_INT8); - // if (fusion_nodes_map["branch"]) { - // fusion_nodes_map["branch"]->Var()->SetDataType( - // proto::VarType::Type::VarType_Type_INT8); - // } - // } - // Generate conv2d_xpu op + op_weights_precision, + act_type, + &var_quant_scales); + framework::OpDesc conv2d_xpu_op_desc(block); - for (auto [first, second] : fusion_nodes_map) { - VLOG(1) << "first: " << first << " second: " << second; - if (first == "x" || first == "out" || first == "out_max" || - first == "branch") - continue; - if (second != nullptr) { - auto* temp_tensor = - scope->FindVar(second->Name())->GetMutable(); - VLOG(1) << *temp_tensor; - } - } - // set input&output var conv2d_xpu_op_desc.SetType("conv2d_xpu"); conv2d_xpu_op_desc.SetInput("x", {fusion_nodes_map["x"]->Name()}); if (fusion_nodes_map["x_max"]) { @@ -1159,7 +1160,6 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, {fusion_nodes_map["branch_max"]->Name()}); } } - VLOG(1) << "creat conv2d_xpu_op_desc success!"; // set attrs of conv2d_xpu float act_param_ = 0.0f; if (!act_type.empty()) { @@ -1199,26 +1199,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, // out_dtype is same to input precision conv2d_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); - conv2d_xpu_op_desc.SetAttr( - "enable_int8", conv->Op()->GetAttrIfExists("enable_int8")); - if (enable_int8) { - conv2d_xpu_op_desc.SetAttr( - "Input_scale_" + 
fusion_nodes_map["out"]->Name(), - conv->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["out"]->Name())); - conv2d_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["x"]->Name(), - conv->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["x"]->Name())); - if (fusion_nodes_map["branch"]) { - conv2d_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["branch"]->Name(), - conv->Op()->GetAttrIfExists( - "Input_scale_" + fusion_nodes_map["branch"]->Name())); - } - } + conv2d_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); - VLOG(1) << "Set attr success!"; // Link node auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); IR_NODE_LINK_TO(fusion_nodes_map["x"], conv2d_xpu); diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index f087b7caf20ab..cb007fd435178 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -253,7 +254,7 @@ class FcXPUFusePass : public FusePassBase { std::map* fusion_nodes_map, bool with_bias, bool with_bn, - bool enable_int8) const; + std::string op_weights_precision) const; void CreateFusionOutputs( ir::Graph* graph, @@ -261,7 +262,9 @@ class FcXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionInputs( ir::Graph* graph, @@ -269,7 +272,9 @@ class FcXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; Node* GetNodeFromNodesMap( const std::map>& nodes_map, @@ -337,7 +342,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( std::map* fusion_nodes_map, bool with_bias, bool with_bn, - bool enable_int8) const { + std::string op_weights_precision) const { // Get Node auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( @@ -449,7 +454,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights - if (!enable_int8) { + if (op_weights_precision != "int8") { float* filter_ptr = filter_t->mutable_data(paddle::platform::CPUPlace()); for (int i = 0; i < mean_len; ++i) { @@ -495,7 +500,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( Node* filter_intx = nullptr; Node* filter_max = nullptr; Node* scale_max = nullptr; - if (!enable_int8) { + if (op_weights_precision != "int8") { PrepareWeight(graph, scope, block, @@ -561,7 +566,9 @@ void FcXPUFusePass::CreateFusionOutputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( mul != nullptr, @@ -617,7 +624,8 @@ void FcXPUFusePass::CreateFusionOutputs( (*fusion_nodes_map)["out"] = 
fc_out_var_node; // Create out max in - if (enable_int8) { + if (op_weights_precision == "int8" && + AreScalesPresentForNodes(var_quant_scales, {fc_out_var_node})) { std::string fc_out_max_in_name = fc_xpu_out_name + "_max_in"; int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc fc_out_max_in_desc(fc_out_max_in_name); @@ -630,24 +638,8 @@ void FcXPUFusePass::CreateFusionOutputs( block_out_max_in_desc->SetShape(fc_out_max_in_desc.GetShape()); block_out_max_in_desc->SetDataType(fc_out_max_in_desc.GetDataType()); - auto GetOutputScale = [&](Node* var_node, std::string name) -> float { - int nums_any_ops = var_node->outputs.size(); - for (size_t i = 0; i < nums_any_ops; ++i) { - auto* any_op_desc = fc_out_var_node->outputs[i]->Op(); - VLOG(1) << "any_op_desc: " << any_op_desc->Type(); - if (any_op_desc->HasAttr("Input_scale_" + name)) { - VLOG(1) << "find it: " - << "Input_scale_" + name; - return any_op_desc->GetAttrIfExists("Input_scale_" + name); - } - } - return 0; - }; - float output_scale = GetOutputScale(fc_out_var_node, fc_xpu_out_name); - mul->Op()->SetAttr("Input_scale_" + fc_xpu_out_name, output_scale); - VLOG(1) << "fc_xpu_out_name:" << fc_xpu_out_name - << " output_scale: " << output_scale - << "fc_out_var_node name:" << fc_out_var_node->Name(); + float output_scale = + GetScaleValueForNode(var_quant_scales, fc_out_var_node); phi::DenseTensor out_max_in_cpu_tensor; auto* cpu_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); @@ -675,7 +667,9 @@ void FcXPUFusePass::CreateFusionInputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( @@ -690,9 +684,13 @@ void FcXPUFusePass::CreateFusionInputs( // x max std::string mul_x_max_name = mul_x->Name() + "_max"; Node* mul_x_max = nullptr; - if (enable_int8) { - float input_scale = - mul->Op()->GetAttrIfExists("Input_scale_" + mul_x->Name()); + if (op_weights_precision == "int8") { + PADDLE_ENFORCE_EQ(AreScalesPresentForNodes(var_quant_scales, {mul_x}), + true, + platform::errors::InvalidArgument( + "When fc op is running in int8 precision, the scales " + "of input var should be present in!")); + float input_scale = GetScaleValueForNode(var_quant_scales, mul_x); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc x_max_desc(mul_x_max_name); x_max_desc.SetPersistable( @@ -729,6 +727,8 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, with_bn, act_type); auto* scope = param_scope(); + std::unordered_map> var_quant_scales = + GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -784,10 +784,12 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, {"out_max_in", nullptr}, {"out", nullptr}, {"out_max", nullptr}}; - - bool enable_int8 = mul->Op()->GetAttrIfExists("enable_int8"); - std::string op_precision_str = enable_int8 ? 
"int8" : "fp32"; - VLOG(1) << "FC fusion fuse pass is running on " << op_precision_str + std::string op_weights_precision = "float32"; + if (mul->Op()->HasAttr("op_weights_precision")) { + op_weights_precision = + mul->Op()->GetAttrIfExists("op_weights_precision"); + } + VLOG(4) << "FC fusion fuse pass is running on " << op_weights_precision << " precision!"; auto* block = mul->Op()->Block(); CreateFusionWeightsAndBias(graph, @@ -798,12 +800,21 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, &fusion_nodes_map, with_bias, with_bn, - enable_int8); - CreateFusionInputs( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - CreateFusionOutputs( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - VLOG(1) << "CreateFusionOutputs success!"; + op_weights_precision); + CreateFusionInputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); + CreateFusionOutputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); // Generate fc_xpu op framework::OpDesc fc_xpu_op_desc(block); @@ -854,22 +865,10 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope"))); } } + fc_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); // out_dtype is same to input precision fc_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); - fc_xpu_op_desc.SetAttr("enable_int8", - mul->Op()->GetAttrIfExists("enable_int8")); - if (enable_int8) { - fc_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["out"]->Name(), - mul->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["out"]->Name())); - fc_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["x"]->Name(), - mul->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["x"]->Name())); - } - auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc); IR_NODE_LINK_TO(fusion_nodes_map["x"], fc_xpu); if (fusion_nodes_map["x_max"]) { diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index d9ab5448d0fda..9fa5893b81666 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -80,9 +80,10 @@ LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern, auto* fusion_op = pattern->NewNode(fusion_op_repr()) ->assert_is_op("conv2d_xpu") ->assert_more([&](Node* node) { - bool enable_int8 = - node->Op()->GetAttrIfExists("enable_int8"); - return !enable_int8; + std::string op_weights_precision = + node->Op()->GetAttrIfExists( + "op_weights_precision"); + return op_weights_precision != "int8"; }); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x"); @@ -109,9 +110,10 @@ LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope) auto* fusion_op = pattern->NewNode(fusion_op_repr()) ->assert_is_op("fc_xpu") ->assert_more([&](Node* node) { - bool enable_int8 = - node->Op()->GetAttrIfExists("enable_int8"); - return !enable_int8; + std::string op_weights_precision = + node->Op()->GetAttrIfExists( + "op_weights_precision"); + return op_weights_precision != "int8"; }); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); diff --git a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc index fff3c4020b544..a06319250a9cb 100644 --- 
a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc @@ -286,36 +286,13 @@ void MapMatmulV2ToMatmulXPUPass::MapMatmulV2ToMatmul(ir::Graph* graph) const { desc.SetAttr("transpose_X", matmul_v2->Op()->GetAttr("trans_x")); desc.SetAttr("transpose_Y", matmul_v2->Op()->GetAttr("trans_y")); desc.SetAttr("alpha", 1.0f); - if (matmul_v2->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_v2->Op()->GetAttr("enable_int8")); - } - if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_x->Name())) { - desc.SetAttr("Input_scale_" + matmul_x->Name(), - matmul_v2->Op()->GetAttr("Input_scale_" + matmul_x->Name())); - } - if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_y->Name())) { - desc.SetAttr("Input_scale_" + matmul_y->Name(), - matmul_v2->Op()->GetAttr("Input_scale_" + matmul_y->Name())); - } - if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_out->Name())) { - desc.SetAttr( - "Input_scale_" + matmul_out->Name(), - matmul_v2->Op()->GetAttr("Input_scale_" + matmul_out->Name())); + if (matmul_v2->Op()->HasAttr("op_weights_precision")) { + desc.SetAttr("op_weights_precision", + matmul_v2->Op()->GetAttr("op_weights_precision")); } if (matmul_v2->Op()->HasAttr("weight_scale")) { desc.SetAttr("weight_scale", matmul_v2->Op()->GetAttr("weight_scale")); } - if (matmul_v2->Op()->HasAttr("weight_bit_length")) { - desc.SetAttr("weight_bit_length", - matmul_v2->Op()->GetAttr("weight_bit_length")); - } - if (matmul_v2->Op()->HasAttr("weight_quant_axis")) { - desc.SetAttr("weight_quant_axis", - matmul_v2->Op()->GetAttr("weight_quant_axis")); - } - if (matmul_v2->Op()->HasAttr("use_mkldnn")) { - desc.SetAttr("use_mkldnn", matmul_v2->Op()->GetAttr("use_mkldnn")); - } auto matmul_node = graph->CreateOpNode(&desc); IR_NODE_LINK_TO(matmul_x, matmul_node); IR_NODE_LINK_TO(matmul_y, matmul_node); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index a8fc7102a8d88..0c2ab8e67f0d0 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -32,29 +32,34 @@ static void UnlinkNodes(ir::Node* a, ir::Node* b) { b->inputs.end()); } +static void MarkAndLogCannotQuantizeOp(Node* op, + const char* details = nullptr) { + std::stringstream msg_ss; + msg_ss << "Cannot quantize operator " << op->Name() + << " (type: " << op->Op()->Type() << ", id: " << op->id() << ")."; + if (details) msg_ss << " " << details; + VLOG(2) << msg_ss.str().c_str(); + op->Op()->SetAttr("xpu_op_calc_data_type", std::string("float32")); +} void XPUQuantizeOpPass::GetQuantInfo(Graph* graph) const { - GetInfoFromTheTmpOp( - graph, - "has_quant_info", - "var_quant_scales", - const_cast>*>( - &var_quant_scales_)); + var_quant_scales_ = + GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); } -bool XPUQuantizeOpPass::AreScalesPresentForNodes( - std::initializer_list nodes) const { - bool present = true; - for (auto node : nodes) { - if (var_quant_scales_.count(node->Name()) == 0) { - present = false; - } - } - return present; -} +// bool XPUQuantizeOpPass::AreScalesPresentForNodes( +// std::initializer_list nodes) const { +// bool present = true; +// for (auto node : nodes) { +// if (var_quant_scales_.count(node->Name()) == 0) { +// present = false; +// } +// } +// return present; +// } -float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { - return var_quant_scales_.at(node->Name())[0]; -} 
+// float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { +// return var_quant_scales_.at(node->Name())[0]; +// } void XPUQuantizeOpPass::QuantizeInput(Graph* g, Node* op, @@ -78,7 +83,7 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, proto::VarType::Type::VarType_Type_INT8); // Create quantize max_ptr node - float scale = GetScaleValueForNode(input); + float scale = GetScaleValueForNode(&var_quant_scales_, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = input->Name() + "_quantize_max"; VarDesc input_max_desc(input_max_name); @@ -144,7 +149,7 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, proto::VarType::Type::VarType_Type_INT8); // Create dequantize max_ptr node - float scale = GetScaleValueForNode(output); + float scale = GetScaleValueForNode(&var_quant_scales_, output); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = output->Name() + "_dequantize_max"; VarDesc input_max_desc(input_max_name); @@ -221,16 +226,15 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { out_var_node = output_node; } } - if (!AreScalesPresentForNodes({x_var_node})) { - // MarkAndLogCannotQuantizeOp(conv_op, - // "No scale available for the operator"); + if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); return; } QuantizeInput(graph, n, x_var_node, "x"); // Branch input if (branch_var_node != nullptr) { - if (AreScalesPresentForNodes({branch_var_node})) { + if (AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node})) { QuantizeInput(graph, n, branch_var_node, "branch"); } else { n->Op()->SetAttr("xpu_op_force_output_precision", @@ -238,7 +242,61 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { } } - auto has_output_scale = AreScalesPresentForNodes({out_var_node}); + auto has_output_scale = + AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); + if (has_output_scale) { + DequantizeOutput(graph, n, out_var_node, "out"); + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + } else { + n->Op()->SetAttr("xpu_op_force_output_precision", + x_var_node->Var()->GetDataType()); + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + } + } + } +} + +void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const { + for (auto* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if (op->Type() != "fc_xpu") { + continue; + } + Node* w_var_node = nullptr; + Node* x_var_node = nullptr; + Node* out_var_node = nullptr; + + for (auto* input_node : n->inputs) { + if (!input_node->IsVar()) { + continue; + } + if (input_node->Var()->Name() == op->Input("x")[0]) { + x_var_node = input_node; + } else if (input_node->Var()->Name() == op->Input("w")[0]) { + w_var_node = input_node; + } + } + + for (auto* output_node : n->outputs) { + if (!output_node->IsVar()) { + continue; + } + if (output_node->Var()->Name() == op->Output("out")[0]) { + out_var_node = output_node; + } + } + if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); + return; + } + + QuantizeInput(graph, n, x_var_node, "x"); + + auto has_output_scale = + AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); if (has_output_scale) { DequantizeOutput(graph, n, out_var_node, "out"); n->Op()->SetAttr( @@ -266,6 +324,8 @@ void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) 
const { VLOG(1) << "Get quant info from graph success."; QuantizeConv(graph); VLOG(1) << "Quantize conv of the graph success."; + QuantizeFC(graph); + VLOG(1) << "Quantize fc of the graph success."; } } // namespace ir diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h index 0b74682009351..1deb4bebe0dc7 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h @@ -38,6 +38,7 @@ class XPUQuantizeOpPass : public FusePassBase { protected: void ApplyImpl(Graph* graph) const override; void QuantizeConv(Graph* graph) const; + void QuantizeFC(Graph* graph) const; private: void QuantizeInput(Graph* g, @@ -52,11 +53,11 @@ class XPUQuantizeOpPass : public FusePassBase { void GetQuantInfo(Graph* graph) const; - bool AreScalesPresentForNodes(std::initializer_list nodes) const; + // bool AreScalesPresentForNodes(std::initializer_list nodes) const; - float GetScaleValueForNode(Node* node) const; + // float GetScaleValueForNode(Node* node) const; - std::unordered_map> var_quant_scales_; + mutable std::unordered_map> var_quant_scales_; const std::string name_scope_{"xpu_quantize_op_pass"}; }; diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index ef352712102c4..c1eeb4c1036bd 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/constant_folding_pass.h" -#include "paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/identity_op_clean_pass.h" @@ -92,9 +91,6 @@ void ConvertToMixedPrecisionPass::Run() { LoadModel(); if (backend_ == phi::Backend::XPU) { - framework::ir::DeleteQuantDequantLinearOpPass - delete_quant_dequant_linear_op_pass; - delete_quant_dequant_linear_op_pass.Apply(main_graph_.get()); framework::ir::DeleteWeightDequantLinearOpPass delete_weight_dequant_linear_op_pass; delete_weight_dequant_linear_op_pass.Apply(main_graph_.get()); From fdb14aacc0f6666480002c84c0f562cb680b3187 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Fri, 20 Oct 2023 14:05:51 +0800 Subject: [PATCH 05/15] remove op_weights_precision attr --- .../delete_weight_dequant_linear_op_pass.cc | 11 +-- .../ir/quantize_related_pass_utils.h | 68 +++---------------- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 18 +++-- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 27 +++++--- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 63 ++++++++++------- .../framework/ir/xpu/link_xpu_op_max_pass.h | 1 + .../ir/xpu/reshape2_matmul_xpu_fuse_pass.cc | 7 -- 7 files changed, 84 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 968120068b92a..52b4a8fce8c12 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" #include "glog/logging.h" @@ -40,6 +41,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { bool is_int8 = false; std::unordered_set nodes2rm; + std::unordered_map> var_quant_scales{}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { @@ -150,10 +152,9 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_nums)); calcu_op_desc->SetAttr("weight_scale", weight_scale); } - if (bit_length == 8) { - // Current 8-bit quantization only supports int8 - calcu_op_desc->SetAttr("op_weights_precision", - std::string("int8")); + if (!var_quant_scales.count(weight_var_node->Var()->Name())) { + var_quant_scales.insert(std::make_pair( + weight_var_node->Var()->Name(), weight_scale)); } calcu_op_desc->RenameInput( @@ -180,6 +181,8 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { } GraphSafeRemoveNodes(graph, nodes2rm); + SaveQuantInfoInTheGraph( + graph, "has_quant_info", "var_quant_scales", var_quant_scales); graph->Set("enable_int8", new bool(is_int8)); } } // namespace ir diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h index d6c54cef47d90..f8c6358dfcdb1 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -23,24 +23,6 @@ namespace paddle { namespace framework { namespace ir { -static inline void SaveInfoInTheTmpOp( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - const std::unordered_map>& info_map) { - VLOG(3) << "save variables in the first op's attr"; - - const std::string suffix = "_" + key_suffix + "_" + flag; - OpDesc op_desc; - op_desc.SetType("save"); - auto* op_node = graph->CreateOpNode(&op_desc); - - op_node->Op()->SetAttr(flag, true); - for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { - op_node->Op()->SetAttr(iter->first + suffix, iter->second); - } -} - static inline void SaveQuantInfoInTheGraph( ir::Graph* graph, const std::string& flag, @@ -48,53 +30,15 @@ static inline void SaveQuantInfoInTheGraph( const std::unordered_map>& info_map) { VLOG(1) << "Save quant info in the graph!"; const std::string suffix = "_" + key_suffix + "_" + flag; - graph->Set(flag, new bool(true)); + if (!graph->Has(flag)) { + graph->Set(flag, new bool(true)); + } for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { VLOG(1) << "SaveQuantInfoInTheGraph set attr: " << iter->first + suffix; graph->Set(iter->first + suffix, new std::vector(iter->second)); } } -static void GetInfoFromTheTmpOp( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - std::unordered_map>* info_map) { - VLOG(3) << "get variables from the first op's attr"; - - const std::string suffix = "_" + key_suffix + "_" + flag; - for (auto* op_node : - ir::TopologyVarientSort(*graph, static_cast(0))) { - if (!op_node->IsOp() || op_node->Op()->Type() != "save") continue; - VLOG(1) << "Come in save op"; - auto* op_desc = op_node->Op(); - if (op_desc->GetAttrIfExists(flag)) { - VLOG(1) << "flag is true"; - op_desc->RemoveAttr(flag); - std::vector attr_names = op_desc->AttrNames(); - VLOG(1) << "attr_names size:" << attr_names.size(); - for (auto fake_name : attr_names) { - VLOG(1) << "fake_name:" << fake_name; - 
size_t pos = fake_name.find(suffix); - if (pos != std::string::npos) { - std::string name = fake_name.substr(0, pos); - VLOG(1) << "name:" << name; - auto scales_vector = - PADDLE_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); - VLOG(1) << "scales_vector:" << scales_vector[0]; - info_map->insert(std::make_pair(name, scales_vector)); - VLOG(1) << "insert success:"; - op_desc->RemoveAttr(fake_name); - VLOG(1) << "remove success:"; - } - } - graph->RemoveNode(op_node); - VLOG(1) << "remove op node success:"; - break; - } - } -} - static inline void GetQuantInfoFromTheGraph( ir::Graph* graph, const std::string& flag, @@ -164,6 +108,12 @@ static inline float GetScaleValueForNode( return var_quant_scales->at(node->Name())[0]; } +static inline std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name()); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 0605f2355ce2b..b85ff63131e0b 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -371,7 +371,9 @@ class Conv2dXPUFusePass : public FusePassBase { bool with_conv_bias, bool with_bn, bool with_scale, - std::string op_weights_precision) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionInputs( ir::Graph* graph, @@ -482,7 +484,9 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( bool with_conv_bias, bool with_bn, bool with_scale, - std::string op_weights_precision) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( @@ -505,8 +509,10 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // Get Weight scale in int8 scene - std::vector weight_scale = - conv->Op()->GetAttrIfExists>("weight_scale"); + std::vector weight_scale{}; + if (AreScalesPresentForNodes(var_quant_scales, {conv_filter})) { + weight_scale = GetScaleVecValueForNode(var_quant_scales, conv_filter); + } // Create fusion_bias_node auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_conv_bias; @@ -1094,7 +1100,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, with_conv_bias, with_bn, with_scale, - op_weights_precision); + op_weights_precision, + &var_quant_scales); CreateFusionInputs(graph, scope, block, @@ -1199,7 +1206,6 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, // out_dtype is same to input precision conv2d_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); - conv2d_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); // Link node auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index cb007fd435178..4e8a6d9d99c73 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -254,7 +254,9 @@ class FcXPUFusePass : public FusePassBase { std::map* fusion_nodes_map, bool with_bias, bool with_bn, - std::string op_weights_precision) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionOutputs( ir::Graph* graph, @@ -342,7 +344,9 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( 
std::map* fusion_nodes_map, bool with_bias, bool with_bn, - std::string op_weights_precision) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( @@ -371,8 +375,10 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y")); } // Get Weight scale in int8 scene - std::vector weight_scale = - mul->Op()->GetAttrIfExists>("weight_scale"); + std::vector weight_scale{}; + if (AreScalesPresentForNodes(var_quant_scales, {mul_w})) { + weight_scale = GetScaleVecValueForNode(var_quant_scales, mul_w); + } // Create fusion_bias_node auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_bias; @@ -784,10 +790,13 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, {"out_max_in", nullptr}, {"out", nullptr}, {"out_max", nullptr}}; + auto filter_data_type = + scope->FindVar(mul_w->Name())->GetMutable()->dtype(); std::string op_weights_precision = "float32"; - if (mul->Op()->HasAttr("op_weights_precision")) { - op_weights_precision = - mul->Op()->GetAttrIfExists("op_weights_precision"); + if (filter_data_type == phi::DataType::INT8) { + op_weights_precision = "int8"; + } else if (filter_data_type == phi::DataType::FLOAT16) { + op_weights_precision = "float16"; } VLOG(4) << "FC fusion fuse pass is running on " << op_weights_precision << " precision!"; @@ -800,7 +809,8 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, &fusion_nodes_map, with_bias, with_bn, - op_weights_precision); + op_weights_precision, + &var_quant_scales); CreateFusionInputs(graph, scope, block, @@ -865,7 +875,6 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope"))); } } - fc_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); // out_dtype is same to input precision fc_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 9fa5893b81666..bf03a2598726c 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -67,6 +67,7 @@ struct LinkConv2dPattern : public PatternBase { PATTERN_DECL_NODE(fusion_op); // declare variable node's name PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(filter); PATTERN_DECL_NODE(branch); private: @@ -77,23 +78,21 @@ LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern, const std::string& name_scope, bool with_branch) : PatternBase(pattern, name_scope, name_scope), with_branch_(with_branch) { - auto* fusion_op = pattern->NewNode(fusion_op_repr()) - ->assert_is_op("conv2d_xpu") - ->assert_more([&](Node* node) { - std::string op_weights_precision = - node->Op()->GetAttrIfExists( - "op_weights_precision"); - return op_weights_precision != "int8"; - }); + auto* fusion_op = + pattern->NewNode(fusion_op_repr())->assert_is_op("conv2d_xpu"); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x"); + auto* filter = pattern->NewNode(filter_repr()) + ->assert_is_op_input("conv2d_xpu", "filter") + ->assert_is_persistable_var(); PDNode* branch = nullptr; if (with_branch_) { branch = pattern->NewNode(branch_repr()) ->assert_is_op_input("conv2d_xpu", "branch"); - fusion_op->LinksFrom({branch}); + fusion_op->LinksFrom({x, branch, filter}); + } else { + fusion_op->LinksFrom({x, filter}); } - fusion_op->LinksFrom({x}); } 
struct LinkFcPattern : public PatternBase { @@ -103,25 +102,30 @@ struct LinkFcPattern : public PatternBase { PATTERN_DECL_NODE(fusion_op); // declare variable node's name PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(w); }; LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, name_scope) { - auto* fusion_op = pattern->NewNode(fusion_op_repr()) - ->assert_is_op("fc_xpu") - ->assert_more([&](Node* node) { - std::string op_weights_precision = - node->Op()->GetAttrIfExists( - "op_weights_precision"); - return op_weights_precision != "int8"; - }); - auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); + auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op("fc_xpu"); - fusion_op->LinksFrom({x}); + auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); + auto* w = pattern->NewNode(w_repr()) + ->assert_is_op_input("fc_xpu", "w") + ->assert_is_persistable_var(); + fusion_op->LinksFrom({x, w}); } } // namespace patterns +bool LinkXPUOpMaxPass::IsQuant(Node* weight_node) const { + auto w_dtype = param_scope() + ->FindVar(weight_node->Name()) + ->GetMutable() + ->dtype(); + return w_dtype == phi::DataType::INT8; +} + void LinkXPUOpMaxPass::LinkAddActMax(ir::Graph* graph) const { GraphPatternDetector gpd; patterns::LinkAddActPattern pattern(gpd.mutable_pattern(), name_scope_); @@ -168,16 +172,20 @@ void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const { GraphPatternDetector gpd; patterns::LinkConv2dPattern pattern( gpd.mutable_pattern(), name_scope_, with_branch); + auto* scope = param_scope(); int found_subgraph_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle LinkConv2dMax"; - /* declare operator node's name */ + /* get operator node's name */ GET_IR_NODE(fusion_op); - /* declare variable node's name*/ + /* get variable node's name*/ GET_IR_NODE(x); + GET_IR_NODE(filter); GET_IR_NODE(branch); + if (IsQuant(filter)) { + return; + } auto* fusion_op_desc = fusion_op->Op(); bool fusion_op_has_branch = fusion_op_desc->HasInput("branch"); if (fusion_op_has_branch) { @@ -224,14 +232,17 @@ void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const { GraphPatternDetector gpd; patterns::LinkFcPattern pattern(gpd.mutable_pattern(), name_scope_); int found_subgraph_count = 0; - + auto* scope = param_scope(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle LinkFcMax"; - /* declare operator node's name */ + /* get operator node's name */ GET_IR_NODE(fusion_op); - /* declare variable node's name*/ + /* get variable node's name*/ GET_IR_NODE(x); + GET_IR_NODE(w); + + if (IsQuant(w)) return; auto* fusion_op_desc = fusion_op->Op(); auto* x_pre_op = x->inputs[0]->Op(); if (x->inputs.size() > 0 && x->inputs[0]->IsOp() && diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h index cad199ce573bb..a71a2e19cf430 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h @@ -102,6 +102,7 @@ Fused subgraph: */ void LinkAddActMax(ir::Graph* graph) const; + bool IsQuant(Node* weight_node) const; const std::string name_scope_{"link_xpu_op_max_pass"}; }; diff --git a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc index a06319250a9cb..8fa4a377175a7 100644 --- 
a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc @@ -286,13 +286,6 @@ void MapMatmulV2ToMatmulXPUPass::MapMatmulV2ToMatmul(ir::Graph* graph) const { desc.SetAttr("transpose_X", matmul_v2->Op()->GetAttr("trans_x")); desc.SetAttr("transpose_Y", matmul_v2->Op()->GetAttr("trans_y")); desc.SetAttr("alpha", 1.0f); - if (matmul_v2->Op()->HasAttr("op_weights_precision")) { - desc.SetAttr("op_weights_precision", - matmul_v2->Op()->GetAttr("op_weights_precision")); - } - if (matmul_v2->Op()->HasAttr("weight_scale")) { - desc.SetAttr("weight_scale", matmul_v2->Op()->GetAttr("weight_scale")); - } auto matmul_node = graph->CreateOpNode(&desc); IR_NODE_LINK_TO(matmul_x, matmul_node); IR_NODE_LINK_TO(matmul_y, matmul_node); From c30a50c06635305f4a951b48308ed468fe538c8b Mon Sep 17 00:00:00 2001 From: csy0225 Date: Fri, 20 Oct 2023 18:13:43 +0800 Subject: [PATCH 06/15] support fp16 quantize model --- paddle/fluid/framework/ir/CMakeLists.txt | 2 - .../framework/ir/constant_folding_pass.cc | 3 +- .../delete_weight_dequant_linear_op_pass.cc | 2 - paddle/fluid/framework/ir/graph.h | 7 - .../auto_trans_quantize_op_precision_pass.cc | 128 -------- .../xpu/cast_mixed_precision_op_fuse_pass.cc | 44 ++- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 10 +- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 65 ++-- .../framework/ir/xpu/xpu_quantize_op_pass.h | 4 - .../ir/xpu/xpu_quantize_squash_pass.cc | 25 -- .../passes/convert_to_mixed_precision.cc | 7 - .../kernels/fusion/xpu/conv2d_xpu_kernel.cc | 285 +----------------- .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 2 +- 13 files changed, 90 insertions(+), 494 deletions(-) delete mode 100644 paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 42e9a1267e0ee..e41e7d23cd594 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -253,8 +253,6 @@ if(WITH_XPU) pass_library(conv2d_bias_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(xpu_quantize_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(xpu_quantize_squash_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) - pass_library(auto_trans_quantize_op_precision_pass inference DIR xpu DEPS - ${XPU_PASS_DEPS}) pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_squeeze_unsqueeze_elimination_pass inference DIR xpu diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc index 3b3f23933fb6d..ffd6783616052 100644 --- a/paddle/fluid/framework/ir/constant_folding_pass.cc +++ b/paddle/fluid/framework/ir/constant_folding_pass.cc @@ -64,7 +64,8 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const { platform::errors::Fatal( "scope must not be null when applying constant folding.")); - std::vector blacklist{"feed", "matrix_multiply", "save"}; + std::vector blacklist{ + "feed", "matrix_multiply", "save", "dequantize_linear"}; int folded_op_num = 0; auto op_node_sorted = framework::ir::TopologyVarientSort( diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 52b4a8fce8c12..59f25483c110b 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ 
b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -117,8 +117,6 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_tensor->dtype())); } - int bit_length = - PADDLE_GET_CONST(int, op->GetAttr("bit_length")); int quant_axis = PADDLE_GET_CONST(int, op->GetAttr("quant_axis")); if (quant_axis == -1) { // per_layer quant_dequant: all OP diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index e29e5a2a9a9d2..e42334aac0593 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -177,11 +177,7 @@ class Graph { platform::errors::AlreadyExists( "The attribute %s to be set already exists in the graph.", attr_name)); - VLOG(1) << "set attribute " << attr_name; attrs_[attr_name] = attr; - VLOG(1) << "attrs_ size " << attrs_.size(); - std::vector attr_names = AttrNames(); - VLOG(1) << "attr_names size " << attr_names.size(); attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; delete attr; @@ -417,7 +413,6 @@ class Graph { } std::vector AttrNames() const { - VLOG(1) << "graph addr:" << this; if (FLAGS_convert_all_blocks) { if (IsMainGraph()) { return GetSubGraph(0)->AttrNames(); @@ -425,10 +420,8 @@ class Graph { } std::vector res; res.reserve(attrs_.size()); - VLOG(1) << "AttrNames attr size: " << attrs_.size(); for (auto &attr : attrs_) { res.push_back(attr.first); - VLOG(1) << "AttrNames: " << attr.first; } return res; } diff --git a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc deleted file mode 100644 index 9fec1091bd9a9..0000000000000 --- a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "glog/logging.h" - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/xpu/pass_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/enforce.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class AutoTransQuantizeOpPrecisionPass : public FusePassBase { - protected: - void ApplyImpl(ir::Graph* graph) const override; - void FirstRound(ir::Graph* graph) const; - - const std::string name_scope_{"auto_trans_quantize_op_precision_pass"}; - const std::unordered_set support_fusion_quant_op_type_{ - "conv2d_xpu", "fc_xpu"}; -}; - -static inline Node* GetOpOutVarNodeByArgsName(ir::Graph* graph, - Node* op_node, - const std::string& arg_name) { - CHECK_EQ(op_node->IsOp(), true); - auto* op_desc = op_node->Op(); - auto out_var_nodes = op_desc->Output(arg_name); - CHECK_EQ(out_var_nodes.size(), 1UL); - auto out_var_name = out_var_nodes[0]; - auto out_var_node = FindNodeWithName(graph, out_var_name); - return out_var_node; -} - -void AutoTransQuantizeOpPrecisionPass::FirstRound(ir::Graph* graph) const { - auto graph_size = graph->SubGraphsSize(); - VLOG(1) << "There is " << graph_size << " subgraphs need to be handle."; - for (size_t i = 0; i < graph_size; i++) { - auto subgraph = graph->GetSubGraph(i); - VLOG(1) << "Handling the subgraph id: " << i; - for (auto* op_node : TopologySortOperations(*subgraph)) { - auto op_type = op_node->Op()->Type(); - if (support_fusion_quant_op_type_.find(op_type) != - support_fusion_quant_op_type_.end()) { - bool enable_int8 = op_node->Op()->GetAttrIfExists("enable_int8"); - int out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); - if (enable_int8) { - auto* out_var_node = - GetOpOutVarNodeByArgsName(subgraph, op_node, "out"); - PADDLE_ENFORCE_NOT_NULL( - out_var_node, - platform::errors::InvalidArgument( - "out_var_node in graph cannot be nullptr.")); - bool is_int8_out = true; - for (auto* next_op_node : out_var_node->outputs) { - auto next_op_type = next_op_node->Op()->Type(); - bool is_next_op_support_int8 = - next_op_node->Op()->GetAttrIfExists("enable_int8") && - ((support_fusion_quant_op_type_.find(next_op_type) != - support_fusion_quant_op_type_.end())); - if (!is_next_op_support_int8) { - is_int8_out = false; - break; - } - } - if (is_int8_out) { - op_node->Op()->SetAttr( - "out_dtype", - static_cast(proto::VarType::Type::VarType_Type_INT8)); - out_var_node->Var()->SetDataType( - proto::VarType::Type::VarType_Type_INT8); - VLOG(1) << "The out var node " << out_var_node->Name() - << " is INT8"; - } - } - } - } - } -} - -void AutoTransQuantizeOpPrecisionPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::PreconditionNotMet("graph should not be null.")); - Init(name_scope_, graph); - VLOG(1) << "AutoTransQuantizeOpPrecisionPass handling start ..."; - FirstRound(graph); - VLOG(1) << "AutoTransQuantizeOpPrecisionPass handleing end."; -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(auto_trans_quantize_op_precision_pass, - paddle::framework::ir::AutoTransQuantizeOpPrecisionPass); - -REGISTER_PASS_CAPABILITY(auto_trans_quantize_op_precision_pass) - .AddCombination( - 
paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("fc_xpu", 0) - .EQ("conv2d_xpu", 0)); diff --git a/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc index ef8759153b0cc..1a56e4d660431 100644 --- a/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc @@ -127,6 +127,7 @@ int CastMixedPrecisionOpFusePass::ApplyCastBeforePass( GraphPatternDetector gpd; patterns::CastBeforePattern pattern( gpd.mutable_pattern(), name_scope_, mixed_precision_op_type); + auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -136,7 +137,22 @@ int CastMixedPrecisionOpFusePass::ApplyCastBeforePass( GET_IR_NODE(cast); GET_IR_NODE(cast_out); GET_IR_NODE(mixed_precision_op); - + // Note: conv2d_xpu/fc_xpu not support float32/int8/float16, can not fuse. + if (mixed_precision_op_type == "conv2d_xpu") { + auto filter_name = mixed_precision_op->Op()->Input("filter")[0]; + auto filter_data_type = + scope->FindVar(filter_name)->GetMutable()->dtype(); + if (filter_data_type == phi::DataType::INT8) { + return; + } + } else if (mixed_precision_op_type == "fc_xpu") { + auto w_name = mixed_precision_op->Op()->Input("w")[0]; + auto w_data_type = + scope->FindVar(w_name)->GetMutable()->dtype(); + if (w_data_type == phi::DataType::INT8) { + return; + } + } mixed_precision_op->Op()->RenameInput(cast_out->Name(), cast_in->Name()); IR_NODE_LINK_TO(cast_in, mixed_precision_op); @@ -155,6 +171,7 @@ int CastMixedPrecisionOpFusePass::ApplyCastAfterPass( GraphPatternDetector gpd; patterns::CastAfterPattern pattern( gpd.mutable_pattern(), name_scope_, mixed_precision_op_type); + auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -164,7 +181,30 @@ int CastMixedPrecisionOpFusePass::ApplyCastAfterPass( GET_IR_NODE(cast_in); GET_IR_NODE(cast); GET_IR_NODE(cast_out); - + // Note: conv2d_xpu/fc_xpu not support float16/int8/float32, can not fuse. 
+ if (mixed_precision_op_type == "conv2d_xpu") { + auto filter_name = mixed_precision_op->Op()->Input("filter")[0]; + auto filter_data_type = + scope->FindVar(filter_name)->GetMutable()->dtype(); + auto x_name = mixed_precision_op->Op()->Input("x")[0]; + auto* x_node = FindNodeWithName(graph, x_name); + if (filter_data_type == phi::DataType::INT8 && + x_node->Var()->GetDataType() == + proto::VarType::Type::VarType_Type_FP16) { + return; + } + } else if (mixed_precision_op_type == "fc_xpu") { + auto w_name = mixed_precision_op->Op()->Input("w")[0]; + auto w_data_type = + scope->FindVar(w_name)->GetMutable()->dtype(); + auto x_name = mixed_precision_op->Op()->Input("x")[0]; + auto* x_node = FindNodeWithName(graph, x_name); + if (w_data_type == phi::DataType::INT8 && + x_node->Var()->GetDataType() == + proto::VarType::Type::VarType_Type_FP16) { + return; + } + } mixed_precision_op->Op()->RenameOutput(cast_in->Name(), cast_out->Name()); int out_dtype = proto::VarType::Type::VarType_Type_FP32; mixed_precision_op->Op()->SetAttr("out_dtype", out_dtype); diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index b85ff63131e0b..2ce255e81707a 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -1084,10 +1084,14 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, {"out", nullptr}, {"out_max", nullptr}}; + auto filter_data_type = scope->FindVar(conv_filter->Name()) + ->GetMutable() + ->dtype(); std::string op_weights_precision = "float32"; - if (conv->Op()->HasAttr("op_weights_precision")) { - op_weights_precision = - conv->Op()->GetAttrIfExists("op_weights_precision"); + if (filter_data_type == phi::DataType::INT8) { + op_weights_precision = "int8"; + } else if (filter_data_type == phi::DataType::FLOAT16) { + op_weights_precision = "float16"; } VLOG(4) << "Conv2d fusion fuse pass is running on " << op_weights_precision << " precision!"; diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index 0c2ab8e67f0d0..49852ec26311d 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -39,28 +39,12 @@ static void MarkAndLogCannotQuantizeOp(Node* op, << " (type: " << op->Op()->Type() << ", id: " << op->id() << ")."; if (details) msg_ss << " " << details; VLOG(2) << msg_ss.str().c_str(); - op->Op()->SetAttr("xpu_op_calc_data_type", std::string("float32")); } void XPUQuantizeOpPass::GetQuantInfo(Graph* graph) const { var_quant_scales_ = GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); } -// bool XPUQuantizeOpPass::AreScalesPresentForNodes( -// std::initializer_list nodes) const { -// bool present = true; -// for (auto node : nodes) { -// if (var_quant_scales_.count(node->Name()) == 0) { -// present = false; -// } -// } -// return present; -// } - -// float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { -// return var_quant_scales_.at(node->Name())[0]; -// } - void XPUQuantizeOpPass::QuantizeInput(Graph* g, Node* op, Node* input, @@ -232,27 +216,37 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { } QuantizeInput(graph, n, x_var_node, "x"); - // Branch input - if (branch_var_node != nullptr) { - if (AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node})) { + auto has_output_scale = + AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); + bool has_branch = 
branch_var_node != nullptr; + + // Note: Conv2d fusion requres branch datatype is same as output datatype, + // so we should consider branch/output together. + if (has_branch) { + bool has_branch_scale = + AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node}); + if (has_output_scale && has_branch_scale) { QuantizeInput(graph, n, branch_var_node, "branch"); + DequantizeOutput(graph, n, out_var_node, "out"); + // Note: out_dtype attr must be set, because if dequantize_output, we + // consider the kernel out_dtype as int8. + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); } else { - n->Op()->SetAttr("xpu_op_force_output_precision", - branch_var_node->Var()->GetDataType()); + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); } - } - - auto has_output_scale = - AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); - if (has_output_scale) { - DequantizeOutput(graph, n, out_var_node, "out"); - n->Op()->SetAttr( - "out_dtype", - static_cast(proto::VarType::Type::VarType_Type_INT8)); } else { - n->Op()->SetAttr("xpu_op_force_output_precision", - x_var_node->Var()->GetDataType()); - n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + if (has_output_scale) { + DequantizeOutput(graph, n, out_var_node, "out"); + // Note: out_dtype attr must be set, because if dequantize_output, we + // consider the kernel out_dtype as int8. + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + } else { + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + } } } } @@ -303,8 +297,6 @@ void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const { "out_dtype", static_cast(proto::VarType::Type::VarType_Type_INT8)); } else { - n->Op()->SetAttr("xpu_op_force_output_precision", - x_var_node->Var()->GetDataType()); n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); } } @@ -321,11 +313,8 @@ void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument("Scope cannot be nullptr.")); GetQuantInfo(graph); - VLOG(1) << "Get quant info from graph success."; QuantizeConv(graph); - VLOG(1) << "Quantize conv of the graph success."; QuantizeFC(graph); - VLOG(1) << "Quantize fc of the graph success."; } } // namespace ir diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h index 1deb4bebe0dc7..28d0f42e76bde 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h @@ -53,10 +53,6 @@ class XPUQuantizeOpPass : public FusePassBase { void GetQuantInfo(Graph* graph) const; - // bool AreScalesPresentForNodes(std::initializer_list nodes) const; - - // float GetScaleValueForNode(Node* node) const; - mutable std::unordered_map> var_quant_scales_; const std::string name_scope_{"xpu_quantize_op_pass"}; }; diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 8571dee220d3b..3f25be65b3c70 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -104,7 +104,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( dequant_in->Name()); next_op_desc->SetInput(input_name, input_names); } - if (keep_dequant) GraphSafeRemoveNodes(graph, {quant_op, quant_out}); else @@ -114,30 +113,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( IR_NODE_LINK_TO(dequant_in, next_op); 
found_dequant_quant_count++; - } else { - // squash dequantize-quantize to requantize op - // OpDesc desc; - // desc.SetType("requantize"); - // desc.SetInput("Input", - // std::vector({dequant_in->Name()})); - // desc.SetOutput("Output", - // std::vector({quant_out->Name()})); - // desc.SetAttr("Scale_in", dequant_scale); - // desc.SetAttr("Shift_in", dequant_shift); - // desc.SetAttr("Scale_out", quant_scale); - // desc.SetAttr("Shift_out", quant_shift); - - // auto requant_op = g->CreateOpNode(&desc); - - // if (keep_dequant) - // GraphSafeRemoveNodes(graph, {quant_op}); - // else - // GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out}); - - // IR_NODE_LINK_TO(dequant_in, requant_op); - // IR_NODE_LINK_TO(requant_op, quant_out); - - // found_dequant_quant_count++; } }; gpd(graph, handler); diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index c1eeb4c1036bd..3aeeff498a52f 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/constant_folding_pass.h" -#include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/identity_op_clean_pass.h" #include "paddle/fluid/inference/io.h" @@ -90,12 +89,6 @@ void ConvertToMixedPrecisionPass::LoadModel() { void ConvertToMixedPrecisionPass::Run() { LoadModel(); - if (backend_ == phi::Backend::XPU) { - framework::ir::DeleteWeightDequantLinearOpPass - delete_weight_dequant_linear_op_pass; - delete_weight_dequant_linear_op_pass.Apply(main_graph_.get()); - } - framework::ir::ConstantFoldingPass constant_folding_pass; constant_folding_pass.Apply(main_graph_.get()); diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc index 9dce663de72c7..6ba3d84b5eb0b 100644 --- a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc @@ -71,12 +71,9 @@ void Conv2dXPUKernelImpl(const Context& ctx, int out_c = static_cast(filter_dims[0]); int win_h = static_cast(filter_dims[2]); int win_w = static_cast(filter_dims[3]); - VLOG(1) << "KERNEL1"; auto* input_data = reinterpret_cast(x.data()); - VLOG(1) << "KERNEL1.5"; const float* input_max_data = x_max.get_ptr() == nullptr ? 
nullptr : x_max.get_ptr()->data(); - VLOG(1) << "KERNEL2"; auto* filter_data = reinterpret_cast(filter.data()); auto* filter_max_data = filter_max.data(); auto* scale_max_data = scale_max.get_ptr() == nullptr @@ -94,134 +91,18 @@ void Conv2dXPUKernelImpl(const Context& ctx, branch_data = reinterpret_cast(branch_tensor->data()); } else { - if (branch_tensor->dtype() == phi::DataType::FLOAT32 && - out->dtype() == phi::DataType::INT8) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT32 && " - "out->dtype() == phi::DataType::INT8"; - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::quantization( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - branch_data = reinterpret_cast(branch_data_temp); - } else if (branch_tensor->dtype() == phi::DataType::FLOAT16 && - out->dtype() == phi::DataType::INT8) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT16 && " - "out->dtype() == phi::DataType::INT8"; - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::quantization( - ctx.x_context(), - reinterpret_cast( - branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - branch_data = reinterpret_cast(branch_data_temp); - } else if (branch_tensor->dtype() == phi::DataType::INT8 && - out->dtype() == phi::DataType::FLOAT32) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " - "out->dtype() == phi::DataType::FLOAT32"; - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch.get_ptr()->dtype(), - // branch.get_ptr()->numel() * sizeof(int8_t)); - // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_before[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::dequantization( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // phi::DataType::FLOAT32, - // branch.get_ptr()->numel() * sizeof(float)); - // memory_utils::Copy(CPUPlace(), - // static_cast(temp_tensor_cpu.data()), - // ctx.GetPlace(), - // static_cast(branch_data_temp), - // branch.get_ptr()->numel() * sizeof(float)); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_after[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - branch_data = reinterpret_cast(branch_data_temp); - } else if (branch_tensor->dtype() == phi::DataType::INT8 && - out->dtype() == phi::DataType::FLOAT16) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " - "out->dtype() == phi::DataType::FLOAT16"; - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch.get_ptr()->dtype(), - // branch.get_ptr()->numel() * sizeof(int8_t)); - // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_before[" << i - // << "]:" << - // 
static_cast(temp_tensor_cpu.data()[i]); - // } - // } - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::dequantization( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // phi::DataType::FLOAT16, - // branch.get_ptr()->numel() * - // sizeof(dtype::float16)); - // memory_utils::Copy(CPUPlace(), - // static_cast(temp_tensor_cpu.data()), - // ctx.GetPlace(), - // static_cast(branch_data_temp), - // branch.get_ptr()->numel() * sizeof(dtype::float16)); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_after[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - branch_data = reinterpret_cast(branch_data_temp); - } else { - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::cast( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); - branch_data = branch_data_temp; - } + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::cast( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + branch_data = branch_data_temp; } } - VLOG(1) << "KERNEL3"; + const float* bias_data = bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); auto* out_data = @@ -230,133 +111,13 @@ void Conv2dXPUKernelImpl(const Context& ctx, out_max_data = out_max_in.get_ptr() != nullptr ? 
const_cast(out_max_in.get_ptr()->data()) : out_max_data; - VLOG(1) << "KERNEL4.5"; xpu::Activation_t act(static_cast(act_type)); - VLOG(1) << "KERNEL5"; if (act_type == xpu::Activation_t::LEAKY_RELU) { act.leaky_alpha = act_param; } else if (act_type == xpu::Activation_t::HARD_SIGMOID) { act.hard_sigmoid_slope = act_param; } - // if (input_max_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // x_max.get_ptr()->dtype(), - // x_max.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *x_max.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { - // VLOG(1) << "input_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (filter_max_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // filter_max.dtype(), - // filter_max.numel() * sizeof(float)); - // phi::Copy(ctx, filter_max, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { - // VLOG(1) << "filter_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (input_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, x.dtype(), x.numel() * sizeof(T_X)); - // phi::Copy(ctx, x, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "input_data[" << i - // << "]:" << static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (filter_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); - // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "filter_data[" << i - // << "]:" << static_cast(temp_tensor_cpu.data()[i]); - // } - // } - // if (bias_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // bias.get_ptr()->dtype(), - // bias.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *bias.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "bias_data[" << i << "]:" << - // temp_tensor_cpu.data()[i]; - // } - // } - - // if (branch_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch.get_ptr()->dtype(), - // branch.get_ptr()->numel() * sizeof(T_OUT)); - // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (branch_max) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch_max.get_ptr()->dtype(), - // branch_max.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *branch_max.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (scale_max) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // scale_max.get_ptr()->dtype(), - // scale_max.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *scale_max.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "scale_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (filter_data) { - // DenseTensor temp_tensor_cpu; - // 
ctx.template HostAlloc( - // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); - // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "filter_data[" << i - // << "]:" << static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (out_max_in.get_ptr()) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, out_max_in.get_ptr()->dtype(), - // out_max_in.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *out_max_in.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < out_max_in.get_ptr()->numel(); - // ++i) { - // VLOG(1) << "output_max_data_before[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } int r = xpu:: conv2d_fusion( // TX/TW/TY/TGEMM /* baidu::xpu::api::Context* ctx */ ctx.x_context(), @@ -383,30 +144,6 @@ void Conv2dXPUKernelImpl(const Context& ctx, /* const float* branch_maxptr */ branch_max_data, /* const float* scale */ scale_max_data); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_xpu"); - // if (out_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, out->dtype(), out->numel() * sizeof(T_OUT)); - // phi::Copy(ctx, *out, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "output_data[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (out_max) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, out_max->dtype(), out_max->numel() * - // sizeof(float)); - // phi::Copy(ctx, *out_max, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "output_max_data_after[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } } #define CONV2D_XPU_KERNEL_IMPL(x_dtype_, w_dtype_, out_dtype_, gemm_dtype_) \ @@ -453,7 +190,7 @@ void Conv2dXPUKernel(const Context& ctx, DenseTensor* out, DenseTensor* out_max) { // Dont use template T param - VLOG(1) << "Kernel type: " << x.dtype() << "," << filter.dtype() << " ," + VLOG(4) << "Conv kernel type: " << x.dtype() << " ," << filter.dtype() << " ," << out_dtype; if (x.dtype() == DataType::FLOAT32) { // float32/float16 kernel diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index eeb36a86eeec7..d6153eff096cb 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -133,7 +133,7 @@ void FcXPUKernel(const Context& ctx, DenseTensor* out, DenseTensor* out_max) { // Dont use template T param - VLOG(1) << "Kernel type: " << x.dtype() << " ," << w.dtype() << " ," + VLOG(4) << "Fc kernel type: " << x.dtype() << " ," << w.dtype() << " ," << out_dtype; if (x.dtype() == DataType::FLOAT32) { // float32/float16 kernel From 11da53f712e5c3778e4686ee06fb533be12fc64c Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 23 Oct 2023 10:31:57 +0800 Subject: [PATCH 07/15] code style update --- .../framework/ir/auto_mixed_precision_pass.cc | 2 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 2 -- .../ir/quantize_related_pass_utils.h | 33 ----------------- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 8 ++--- paddle/fluid/framework/ir/xpu/quant_utils.cc | 35 ------------------- paddle/fluid/framework/ir/xpu/quant_utils.h | 2 -- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 32 ++++++++--------- .../ir/xpu/xpu_quantize_squash_pass.cc | 8 ++--- 
.../ir/xpu/xpu_quantize_squash_pass.h | 31 ++-------------- .../inference/api/paddle_pass_builder.cc | 1 - 10 files changed, 23 insertions(+), 131 deletions(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index fe5ec348bf707..dc93b003383f4 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -771,7 +771,7 @@ void AutoMixedPrecisionPass::SetVarPrecision() const { ->GetMutable(); if (framework::TransToProtoVarType(tensor->type()) != real_in_var_node->Var()->GetDataType()) { - VLOG(1) << "[AutoMixedPrecisionPass] variable " + VLOG(3) << "[AutoMixedPrecisionPass] variable " << real_in_var_node->Name() << "'s proto data type " << real_in_var_node->Var()->GetDataType() << " is different from real dense tensor " diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 0a9fc07a7cb07..dad7a1c49c194 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -91,7 +91,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { GraphPatternDetector gpd; auto* scope = param_scope(); - BlockDesc* block = nullptr; PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument( @@ -114,7 +113,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { return; } */ - block = quantize_linear_op->Op()->Block(); std::unordered_set nodes2rm = {}; // Get input scale from tensor diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h index f8c6358dfcdb1..86f2160d31bc4 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -28,61 +28,28 @@ static inline void SaveQuantInfoInTheGraph( const std::string& flag, const std::string& key_suffix, const std::unordered_map>& info_map) { - VLOG(1) << "Save quant info in the graph!"; const std::string suffix = "_" + key_suffix + "_" + flag; if (!graph->Has(flag)) { graph->Set(flag, new bool(true)); } for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { - VLOG(1) << "SaveQuantInfoInTheGraph set attr: " << iter->first + suffix; graph->Set(iter->first + suffix, new std::vector(iter->second)); } } -static inline void GetQuantInfoFromTheGraph( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - std::unordered_map>* info_map) { - VLOG(1) << "Get quant info from the graph attrs!"; - const std::string suffix = "_" + key_suffix + "_" + flag; - VLOG(1) << "flag:" << (graph->Has(flag) ? 
1 : 0); - if (graph->Has(flag)) { - std::vector attr_names = graph->AttrNames(); - VLOG(1) << "attr_names size:" << attr_names.size(); - for (auto fake_name : attr_names) { - VLOG(1) << "fake_name:" << fake_name; - size_t pos = fake_name.find(suffix); - if (pos != std::string::npos) { - std::string name = fake_name.substr(0, pos); - VLOG(1) << "name:" << name; - auto scales_vector = graph->Get>(fake_name); - VLOG(1) << "scales_vector:" << scales_vector[0]; - info_map->insert(std::make_pair(name, scales_vector)); - } - } - } -} - static inline std::unordered_map> GetQuantInfoFromTheGraph(ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { std::unordered_map> info_map; - VLOG(1) << "Get quant info from the graph attrs!"; const std::string suffix = "_" + key_suffix + "_" + flag; - VLOG(1) << "flag:" << (graph->Has(flag) ? 1 : 0); if (graph->Has(flag)) { std::vector attr_names = graph->AttrNames(); - VLOG(1) << "attr_names size:" << attr_names.size(); for (auto fake_name : attr_names) { - VLOG(1) << "fake_name:" << fake_name; size_t pos = fake_name.find(suffix); if (pos != std::string::npos) { std::string name = fake_name.substr(0, pos); - VLOG(1) << "name:" << name; auto scales_vector = graph->Get>(fake_name); - VLOG(1) << "scales_vector:" << scales_vector[0]; info_map.insert(std::make_pair(name, scales_vector)); } } diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 2ce255e81707a..6fb76c5dbe457 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -780,9 +780,7 @@ void Conv2dXPUFusePass::CreateFusionInputs( float input_scale = GetScaleValueForNode(var_quant_scales, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc conv_input_max_desc(conv_input_max_name); - conv_input_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to xpu - // device + conv_input_max_desc.SetPersistable(true); conv_input_max_desc.SetShape({static_cast(max_ptr_size)}); conv_input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); conv2d_xpu_input_max = graph->CreateVarNode(&conv_input_max_desc); @@ -828,9 +826,7 @@ void Conv2dXPUFusePass::CreateFusionBranch( if (op_weights_precision == "int8" && !ew_branch_add_max) { int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc ew_branch_add_in_max_desc(ew_branch_add_max_name); - ew_branch_add_in_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to - // xpu device + ew_branch_add_in_max_desc.SetPersistable(true); ew_branch_add_in_max_desc.SetShape({static_cast(max_ptr_size)}); ew_branch_add_in_max_desc.SetDataType( proto::VarType::Type::VarType_Type_FP32); diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index 90ca41f72958e..08c1da2148687 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -148,41 +148,6 @@ void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out) { } } -void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out) { - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - - paddle::experimental::CheckAndTrans2Contiguous(in); - - phi::DenseTensor int8_tensor; - phi::DenseTensor* out_ptr = out == nullptr ? 
&int8_tensor : out; - out_ptr->Resize(in->dims()); - out_ptr->set_type(phi::DataType::INT8); - out_ptr->set_layout(in->layout()); - - switch (in->dtype()) { - case phi::DataType::FLOAT32: - phi::CastKernel(*cpu_ctx, *in, phi::DataType::INT8, out_ptr); - break; - case phi::DataType::INT8: - if (out == nullptr) { - return; - } else { - phi::AssignKernel(*cpu_ctx, *in, out_ptr); - } - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support fp32, but received dtype is %s.", - phi::DataTypeToString(in->dtype()))); - break; - } - - if (out == nullptr) { - Assign(*out_ptr, in); - } -} - static float FindMaxAbs(const float* data, int len) { float max_f = 0.0f; for (int i = 0; i < len; ++i) { diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h index 30f73023b632d..b564bcac7202d 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.h +++ b/paddle/fluid/framework/ir/xpu/quant_utils.h @@ -25,8 +25,6 @@ void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); -void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); - void CastToInt32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); template diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index 49852ec26311d..dc151a12ee2fb 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -65,15 +65,13 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); quantize_out_node->Var()->SetDataType( proto::VarType::Type::VarType_Type_INT8); - // Create quantize max_ptr node + // Create quantize max_ptr node float scale = GetScaleValueForNode(&var_quant_scales_, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = input->Name() + "_quantize_max"; VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to xpu - // device + input_max_desc.SetPersistable(true); input_max_desc.SetShape({static_cast(max_ptr_size)}); input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); Node* input_max_node = g->CreateVarNode(&input_max_desc); @@ -88,7 +86,7 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, input_scales.data(), max_ptr_size * sizeof(float)); - // create a quantize op node + // Create a quantize op node OpDesc q_desc; q_desc.SetType("quantize_xpu"); q_desc.SetInput("x", std::vector({input->Name()})); @@ -97,12 +95,13 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, q_desc.SetAttr("out_dtype", static_cast(proto::VarType::Type::VarType_Type_INT8)); q_desc.SetAttr("scale", static_cast(scale)); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
- // update op's input + + // Update op's input op->Op()->SetInput(input_arg_name, std::vector({quantize_out_node->Name()})); - // link quantize op + + // Link quantize op UnlinkNodes(input, op); IR_NODE_LINK_TO(input, quantize_op); IR_NODE_LINK_TO(input_max_node, quantize_op); @@ -137,9 +136,7 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = output->Name() + "_dequantize_max"; VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to xpu - // device + input_max_desc.SetPersistable(true); input_max_desc.SetShape({static_cast(max_ptr_size)}); input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); Node* input_max_node = g->CreateVarNode(&input_max_desc); @@ -154,7 +151,7 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, input_scales.data(), max_ptr_size * sizeof(float)); - // create a quantize op node + // Create a quantize op node OpDesc deq_desc; deq_desc.SetType("dequantize_xpu"); deq_desc.SetInput("x", @@ -163,12 +160,13 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, deq_desc.SetOutput("y", std::vector({output->Name()})); deq_desc.SetAttr("out_dtype", static_cast(output->Var()->GetDataType())); deq_desc.SetAttr("scale", static_cast(scale)); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. - // update op's input + + // Update op's input op->Op()->SetOutput(output_arg_name, std::vector({dequantize_in_node->Name()})); - // link dequantize op + + // Link dequantize op UnlinkNodes(op, output); IR_NODE_LINK_TO(op, dequantize_in_node); IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); @@ -220,8 +218,8 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); bool has_branch = branch_var_node != nullptr; - // Note: Conv2d fusion requres branch datatype is same as output datatype, - // so we should consider branch/output together. + // Note: Conv2d fusion requires branch datatype is same as output + // datatype, so we should consider branch/output together. if (has_branch) { bool has_branch_scale = AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node}); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 3f25be65b3c70..7b1658d8d13aa 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -91,10 +91,8 @@ void XPUQuantizeSquashPass::DequantQuantSquash( bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; int equal = dequant_scale == quant_scale ? 
1 : 0; - if (dequant_scale == quant_scale || isnan(dequant_scale) || - isnan(quant_scale) || isinf(dequant_scale) || isinf(quant_scale)) { + if (dequant_scale == quant_scale) { // squash dequantize-quantize to nothing - auto quant_out_var_name = quant_out->Name(); for (auto input_name : next_op_desc->InputNames()) { auto& input_names = next_op_desc->MutableInputs()->at(input_name); @@ -131,7 +129,6 @@ void XPUQuantizeSquashPass::OpDequantSquash(Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { VLOG(4) << "squash op-dequant ops pair"; - GET_IR_NODE_FROM_SUBGRAPH(any_op, any_op, op_dequant_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, op_dequant_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, op_dequant_pattern); @@ -275,7 +272,8 @@ void XPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { FindNodesToKeep(graph, &nodes_keep_counter); DequantQuantSquash(graph, &nodes_keep_counter); OpDequantSquash(graph); - // QuantOpSquash(graph); + // QuantOpSquash(graph); // If the quant op is fused into conv2d_xpu, the + // performance will become worse. MultipleQuantizeSquash(graph); } diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h index fbfa967791304..2d3fbb94f140e 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h @@ -44,34 +44,14 @@ class XPUQuantizeSquashPass : public FusePassBase { std::unordered_map* nodes_keep_counter) const; /* - * Don't squash unsigned dequantize with signed quantize. - * This is important for concat and elementwise ops. - * When inputs have different sign, concat will assume signed type and - * elementwise assumes first input type. 
- */ - bool IsDequantizeQuantizeIncompatible(Node* quant_op, - Node* dequant_op, - Node* next_op) const; - - /* - * Squash dequantize-quantize ops pairs into requantize or nothing + * Squash dequantize-quantize ops pairs into nothing */ void DequantQuantSquash( Graph* graph, std::unordered_map* nodes_keep_counter) const; /* - * Squash requantize op into conv with scale_out like requantize scale_out - */ - void OpRequantSquash(Graph* graph) const; - - /* - * Squash requantize op if the next operator's input scale can be updated - */ - void RequantOpSquash(Graph* graph) const; - - /* - * Squash dequant if the previous operator has force_fp32_output attribute + * Squash dequant if the previous operator support fp32 out */ void OpDequantSquash(Graph* graph) const; @@ -90,13 +70,6 @@ class XPUQuantizeSquashPass : public FusePassBase { */ void ScaleQuantSquash(Graph* graph) const; - /* - * Squash quantize if is before bfloat16 conv2d or fused_conv2d - */ - void QuantizeBf16Conv(Graph* graph) const; - - void QuantizeBf16ConvImpl(Graph* graph, const std::string& conv_type) const; - /* * Squash quantize if is before conv2d_xpu/fc_xpuy */ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 41d2ccd67b43a..25c2e0988c419 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -565,7 +565,6 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "cast_mixed_precision_op_fuse_pass", "xpu_quantize_op_pass", "xpu_quantize_squash_pass", - // "auto_trans_quantize_op_precision_pass", "delete_isolated_node_pass", "inplace_op_var_pass", }); From 748bb9dd3affebd0042d14e654c4603938260d9e Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 23 Oct 2023 12:40:38 +0800 Subject: [PATCH 08/15] update quantize/dequantize op yaml --- .../ir/xpu/xpu_graph_pattern_detector.cc | 16 ++----- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 44 +------------------ .../ir/xpu/xpu_quantize_squash_pass.cc | 2 - paddle/phi/api/yaml/ops.yaml | 6 +-- paddle/phi/infermeta/binary.cc | 20 --------- paddle/phi/infermeta/binary.h | 12 ----- paddle/phi/infermeta/unary.cc | 18 ++++++++ paddle/phi/infermeta/unary.h | 10 +++++ .../phi/kernels/xpu/dequantization_kernel.cc | 16 ++++--- paddle/phi/kernels/xpu/quantization_kernel.cc | 16 ++++--- 10 files changed, 52 insertions(+), 108 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc index f74f9c8289d65..f1d2752321aad 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc @@ -57,9 +57,6 @@ PDNode *patterns::DequantQuantXPUAny::operator()() { auto *dequant_in = pattern->NewNode(dequant_in_repr()) ->AsInput() ->assert_is_op_input("dequantize_xpu", "x"); - auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) - ->AsInput() - ->assert_is_op_input("dequantize_xpu", "max"); auto *dequant_op = pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); @@ -68,9 +65,6 @@ PDNode *patterns::DequantQuantXPUAny::operator()() { ->AsOutput() ->assert_is_op_output("dequantize_xpu", "y"); - auto *quant_max_in = pattern->NewNode(quant_max_in_repr()) - ->assert_is_op_input("quantize_xpu", "max"); - auto *quant_op = pattern->NewNode(quant_op_repr()) ->assert_is_op("quantize_xpu") ->AsIntermediate(); @@ -81,8 +75,8 @@ PDNode *patterns::DequantQuantXPUAny::operator()() { auto *next_op = 
pattern->NewNode(next_op_repr())->assert_is_op(); - dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); - quant_op->LinksFrom({dequant_out, quant_max_in}).LinksTo({quant_out}); + dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out}); + quant_op->LinksFrom({dequant_out}).LinksTo({quant_out}); next_op->LinksFrom({quant_out}); return quant_out; @@ -92,10 +86,6 @@ PDNode *patterns::OpDequantXPU::operator()() { auto any_op = pattern->NewNode(any_op_repr())->assert_is_op(); auto *dequant_in = pattern->NewNode(dequant_in_repr()) ->assert_is_op_input("dequantize_xpu", "x"); - - auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) - ->AsInput() - ->assert_is_op_input("dequantize_xpu", "max"); auto *dequant_op = pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); auto dequant_out = pattern->NewNode(dequant_out_repr()) @@ -103,7 +93,7 @@ PDNode *patterns::OpDequantXPU::operator()() { ->assert_is_op_output("dequantize_xpu", "y"); any_op->LinksTo({dequant_in}); - dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); + dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out}); return dequant_out; } diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index dc151a12ee2fb..ebeb75763320e 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -66,31 +66,11 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, quantize_out_node->Var()->SetDataType( proto::VarType::Type::VarType_Type_INT8); - // Create quantize max_ptr node - float scale = GetScaleValueForNode(&var_quant_scales_, input); - int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); - std::string input_max_name = input->Name() + "_quantize_max"; - VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable(true); - input_max_desc.SetShape({static_cast(max_ptr_size)}); - input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - Node* input_max_node = g->CreateVarNode(&input_max_desc); - auto input_max_tensor = - scope->Var(input_max_name)->GetMutable(); - input_max_tensor->set_type(phi::DataType::FLOAT32); - input_max_tensor->Resize({max_ptr_size}); - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - std::vector input_scales(max_ptr_size, scale); - memcpy(cpu_ctx->Alloc(input_max_tensor), - input_scales.data(), - max_ptr_size * sizeof(float)); - // Create a quantize op node + float scale = GetScaleValueForNode(&var_quant_scales_, input); OpDesc q_desc; q_desc.SetType("quantize_xpu"); q_desc.SetInput("x", std::vector({input->Name()})); - q_desc.SetInput("max", std::vector({input_max_name})); q_desc.SetOutput("y", std::vector({quantize_out_node->Name()})); q_desc.SetAttr("out_dtype", static_cast(proto::VarType::Type::VarType_Type_INT8)); @@ -104,7 +84,6 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, // Link quantize op UnlinkNodes(input, op); IR_NODE_LINK_TO(input, quantize_op); - IR_NODE_LINK_TO(input_max_node, quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_node); IR_NODE_LINK_TO(quantize_out_node, op); } @@ -131,32 +110,12 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, dequantize_in_node->Var()->SetDataType( proto::VarType::Type::VarType_Type_INT8); - // Create dequantize max_ptr node float scale = GetScaleValueForNode(&var_quant_scales_, output); - int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); - std::string 
input_max_name = output->Name() + "_dequantize_max"; - VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable(true); - input_max_desc.SetShape({static_cast(max_ptr_size)}); - input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - Node* input_max_node = g->CreateVarNode(&input_max_desc); - auto input_max_tensor = - scope->Var(input_max_name)->GetMutable(); - input_max_tensor->set_type(phi::DataType::FLOAT32); - input_max_tensor->Resize({max_ptr_size}); - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - std::vector input_scales(max_ptr_size, scale); - memcpy(cpu_ctx->Alloc(input_max_tensor), - input_scales.data(), - max_ptr_size * sizeof(float)); - // Create a quantize op node OpDesc deq_desc; deq_desc.SetType("dequantize_xpu"); deq_desc.SetInput("x", std::vector({dequantize_in_node->Name()})); - deq_desc.SetInput("max", std::vector({input_max_name})); deq_desc.SetOutput("y", std::vector({output->Name()})); deq_desc.SetAttr("out_dtype", static_cast(output->Var()->GetDataType())); deq_desc.SetAttr("scale", static_cast(scale)); @@ -170,7 +129,6 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, UnlinkNodes(op, output); IR_NODE_LINK_TO(op, dequantize_in_node); IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(input_max_node, dequantize_op); IR_NODE_LINK_TO(dequantize_op, output); } diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 7b1658d8d13aa..0e6fd9797c177 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -59,7 +59,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( Graph* graph, std::unordered_map* nodes_keep_counter) const { GraphPatternDetector gpd; - LOG(INFO) << "DequantQuantSquash COME IN"; patterns::DequantQuantXPUAny squash_pattern{gpd.mutable_pattern(), "dequant_quant_xpu_any"}; squash_pattern(); @@ -90,7 +89,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( // check if dequantize op should be kept or removed, decrease the counter bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; - int equal = dequant_scale == quant_scale ? 
1 : 0; if (dequant_scale == quant_scale) { // squash dequantize-quantize to nothing auto quant_out_var_name = quant_out->Name(); diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index e41773a39c0dd..33b00e58ce841 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -658,14 +658,13 @@ backward : depthwise_conv2d_grad - op : dequantize_xpu - args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + args : (Tensor x, DataType out_dtype, float scale = 1.0f) output : Tensor(y) infer_meta : func : DeQuantizeXPUInferMeta kernel : func : dequantize_xpu data_type: x - optional : max - op : det args : (Tensor x) @@ -2050,14 +2049,13 @@ backward : qr_grad - op : quantize_xpu - args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + args : (Tensor x, DataType out_dtype, float scale = 1.0f) output : Tensor(y) infer_meta : func : QuantizeXPUInferMeta kernel : func : quantize_xpu data_type : x - optional : max - op : real args : (Tensor x) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index b5599c454fdcc..2aa8543eb82c3 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -978,16 +978,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input, config); } -void DeQuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y) { - auto x_dims = x.dims(); - y->set_dims(x_dims); - y->set_dtype(out_dtype); -} - void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -2607,16 +2597,6 @@ void PriorBoxInferMeta(const MetaTensor& input, var->set_dims(phi::make_ddim(dim_vec)); } -void QuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y) { - auto x_dims = x.dims(); - y->set_dims(x_dims); - y->set_dtype(out_dtype); -} - void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x, const MetaTensor& repeats, int dim, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 88e4356b105c4..153a8d553ceb5 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -155,12 +155,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); -void DeQuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y); - void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -414,12 +408,6 @@ void PriorBoxInferMeta(const MetaTensor& input, MetaTensor* out, MetaTensor* var); -void QuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y); - void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 243f0b232395e..e6238d0ee5be0 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -672,6 +672,15 @@ void DecodeJpegInferMeta(const MetaTensor& x, } } +void DeQuantizeXPUInferMeta(const MetaTensor& x, + DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void DiagEmbedInferMeta( const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out) { auto x_dims = x.dims(); @@ -3768,6 +3777,15 @@ void FillSplitOutDims(const MetaTensor& x, } } +void QuantizeXPUInferMeta(const MetaTensor& x, + 
DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void SplitInferMeta(const MetaTensor& x, const IntArray& sections, const Scalar& axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index d79b53a71097e..8a28d454e42f7 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -145,6 +145,11 @@ void DecodeJpegInferMeta(const MetaTensor& x, const std::string& mode, MetaTensor* out); +void DeQuantizeXPUInferMeta(const MetaTensor& x, + DataType out_dtype, + float scale, + MetaTensor* y); + void DiagEmbedInferMeta( const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out); @@ -453,6 +458,11 @@ void QrInferMeta(const MetaTensor& x, MetaTensor* q, MetaTensor* r); +void QuantizeXPUInferMeta(const MetaTensor& x, + DataType out_dtype, + float scale, + MetaTensor* y); + void WeightQuantizeInferMeta(const MetaTensor& x, const std::string& algo, MetaTensor* out, diff --git a/paddle/phi/kernels/xpu/dequantization_kernel.cc b/paddle/phi/kernels/xpu/dequantization_kernel.cc index 20423c1eb8920..759a3fd020458 100644 --- a/paddle/phi/kernels/xpu/dequantization_kernel.cc +++ b/paddle/phi/kernels/xpu/dequantization_kernel.cc @@ -19,7 +19,7 @@ namespace phi { template void DeQuantizeKernelImpl(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, + float scale, DenseTensor* y) { using XPUInX = typename XPUTypeTrait::Type; using XPUOutY = typename XPUTypeTrait::Type; @@ -27,9 +27,12 @@ void DeQuantizeKernelImpl(const Context& ctx, auto* y_data = ctx.template Alloc(y); const auto* x_data = x.data(); int64_t len = x.numel(); - const float* max_data = - max.get_ptr() == nullptr ? nullptr : max->data(); - int r = xpu::dequantization( + int max_ptr_size = ctx.x_context()->max_ptr_size(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + auto max_data = RAII_GUARD.alloc_l3_or_gm(max_ptr_size); + int r = xpu::constant(ctx.x_context(), max_data, max_ptr_size, scale); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + r = xpu::dequantization( ctx.x_context(), reinterpret_cast(x_data), reinterpret_cast(y_data), @@ -41,16 +44,15 @@ void DeQuantizeKernelImpl(const Context& ctx, template void DeQuantizeKernel(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, DataType out_dtype, float scale, DenseTensor* y) { switch (out_dtype) { case DataType::FLOAT32: - DeQuantizeKernelImpl(ctx, x, max, y); + DeQuantizeKernelImpl(ctx, x, scale, y); break; case DataType::FLOAT16: - DeQuantizeKernelImpl(ctx, x, max, y); + DeQuantizeKernelImpl(ctx, x, scale, y); break; default: PADDLE_THROW(phi::errors::Unavailable( diff --git a/paddle/phi/kernels/xpu/quantization_kernel.cc b/paddle/phi/kernels/xpu/quantization_kernel.cc index 01f6ddad93aa0..32b28b034e2da 100644 --- a/paddle/phi/kernels/xpu/quantization_kernel.cc +++ b/paddle/phi/kernels/xpu/quantization_kernel.cc @@ -19,7 +19,7 @@ namespace phi { template void QuantizeKernelImpl(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, + float scale, DenseTensor* y) { using XPUInX = typename XPUTypeTrait::Type; using XPUOutY = typename XPUTypeTrait::Type; @@ -27,9 +27,12 @@ void QuantizeKernelImpl(const Context& ctx, auto* y_data = ctx.template Alloc(y); const auto* x_data = x.data(); int64_t len = x.numel(); - const float* max_data = - max.get_ptr() == nullptr ? 
nullptr : max->data(); - int r = xpu::quantization( + int max_ptr_size = ctx.x_context()->max_ptr_size(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + auto max_data = RAII_GUARD.alloc_l3_or_gm(max_ptr_size); + int r = xpu::constant(ctx.x_context(), max_data, max_ptr_size, scale); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + r = xpu::quantization( ctx.x_context(), reinterpret_cast(x_data), reinterpret_cast(y_data), @@ -41,16 +44,15 @@ void QuantizeKernelImpl(const Context& ctx, template void QuantizeKernel(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, DataType out_dtype, float scale, DenseTensor* y) { switch (out_dtype) { case DataType::INT16: - QuantizeKernelImpl(ctx, x, max, y); + QuantizeKernelImpl(ctx, x, scale, y); break; case DataType::INT8: - QuantizeKernelImpl(ctx, x, max, y); + QuantizeKernelImpl(ctx, x, scale, y); break; default: PADDLE_THROW(phi::errors::Unavailable( From 5fea223f9060300a2741bc66575f3d7a95e9c40e Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 23 Oct 2023 14:32:00 +0800 Subject: [PATCH 09/15] fix code style --- paddle/fluid/framework/ir/CMakeLists.txt | 7 ++- .../ir/delete_quant_dequant_linear_op_pass.cc | 2 +- .../delete_weight_dequant_linear_op_pass.cc | 2 +- ...d_pass_utils.h => quantize_pass_helper.cc} | 21 +++----- .../fluid/framework/ir/quantize_pass_helper.h | 49 +++++++++++++++++++ .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 20 ++++---- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 17 +------ .../framework/ir/xpu/link_xpu_op_max_pass.cc | 2 - .../framework/ir/xpu/xpu_quantize_op_pass.cc | 2 +- .../ir/xpu/xpu_quantize_squash_pass.cc | 2 - .../ir/xpu/xpu_quantize_squash_pass.h | 10 ---- .../phi/kernels/xpu/dequantization_kernel.cc | 2 +- 12 files changed, 78 insertions(+), 58 deletions(-) rename paddle/fluid/framework/ir/{quantize_related_pass_utils.h => quantize_pass_helper.cc} (81%) create mode 100644 paddle/fluid/framework/ir/quantize_pass_helper.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bd9d40bde4702..47e7a9948856c 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,6 +59,10 @@ cc_library( placement_pass_base SRCS placement_pass_base.cc DEPS pass) +cc_library( + quantize_pass_helper + SRCS quantize_pass_helper.cc + DEPS pass graph graph_helper) cc_library( coalesce_grad_tensor_pass @@ -241,7 +245,8 @@ if(WITH_XPU) xpu_graph_pattern_detector SRCS xpu/xpu_graph_pattern_detector.cc DEPS graph_pattern_detector) - set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils xpu_graph_pattern_detector) + set(XPU_PASS_DEPS quantize_pass_helper xpu_quant_utils xpu_pass_utils + xpu_graph_pattern_detector) pass_library(cast_mixed_precision_op_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index c36fd3d4ff269..025cd0c2b7ddd 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -19,7 +19,7 @@ #include #include #include -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc 
b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 59f25483c110b..e30ae85f71c02 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "glog/logging.h" diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_pass_helper.cc similarity index 81% rename from paddle/fluid/framework/ir/quantize_related_pass_utils.h rename to paddle/fluid/framework/ir/quantize_pass_helper.cc index 86f2160d31bc4..730123682f58f 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_pass_helper.cc @@ -12,18 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once - -#include - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" namespace paddle { namespace framework { namespace ir { -static inline void SaveQuantInfoInTheGraph( +void SaveQuantInfoInTheGraph( ir::Graph* graph, const std::string& flag, const std::string& key_suffix, @@ -37,10 +32,8 @@ static inline void SaveQuantInfoInTheGraph( } } -static inline std::unordered_map> -GetQuantInfoFromTheGraph(ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix) { +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { std::unordered_map> info_map; const std::string suffix = "_" + key_suffix + "_" + flag; if (graph->Has(flag)) { @@ -57,7 +50,7 @@ GetQuantInfoFromTheGraph(ir::Graph* graph, return info_map; } -static inline bool AreScalesPresentForNodes( +bool AreScalesPresentForNodes( std::unordered_map>* var_quant_scales, std::initializer_list nodes) { bool present = true; @@ -69,13 +62,13 @@ static inline bool AreScalesPresentForNodes( return present; } -static inline float GetScaleValueForNode( +float GetScaleValueForNode( std::unordered_map>* var_quant_scales, Node* node) { return var_quant_scales->at(node->Name())[0]; } -static inline std::vector GetScaleVecValueForNode( +std::vector GetScaleVecValueForNode( std::unordered_map>* var_quant_scales, Node* node) { return var_quant_scales->at(node->Name()); diff --git a/paddle/fluid/framework/ir/quantize_pass_helper.h b/paddle/fluid/framework/ir/quantize_pass_helper.h new file mode 100644 index 0000000000000..4876cd35a1cf3 --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_pass_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map); + +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix); + +bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes); + +float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 6fb76c5dbe457..09037a0fd60eb 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -515,7 +515,6 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // Create fusion_bias_node auto filter_dims = filter_t->dims(); - bool has_bias = with_bn || with_conv_bias; Node* fusion_bias_node = nullptr; if (with_conv_bias) { auto* ew_bias_add_y = @@ -677,7 +676,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( filter_ptr[i] *= scale_val_; } } else { - for (int i = 0; i < weight_scale.size(); i++) { + for (size_t i = 0; i < weight_scale.size(); i++) { weight_scale[i] *= scale_val_; } } @@ -877,12 +876,12 @@ void Conv2dXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("conv node ptr can not be null")); // output && output max std::string conv2d_xpu_out_name; - Node* conv2d_out_op_node = nullptr; Node* conv2d_out_var_node = nullptr; auto* ew_branch_add = GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + auto* scale = GetNodeFromNodesMap(nodes_map, "scale", "scale"); auto* ew_bias_add = GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add"); if (!act_type.empty()) { @@ -898,7 +897,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( act != nullptr, true, platform::errors::InvalidArgument("act node ptr can not be null")); - conv2d_out_op_node = act; } else if (ew_branch_add) { auto* ew_branch_add_out = GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add_out"); @@ -912,7 +910,14 @@ void Conv2dXPUFusePass::CreateFusionOutputs( true, platform::errors::InvalidArgument( "ew_branch_add node ptr can not be null")); - conv2d_out_op_node = ew_branch_add; + } else if (scale) { + auto* scale_out = GetNodeFromNodesMap(nodes_map, "scale", "scale_out"); + PADDLE_ENFORCE_EQ(scale_out != nullptr, + true, + platform::errors::InvalidArgument( + "scale_out node ptr can not be null")); + conv2d_xpu_out_name = scale_out->Name(); + 
conv2d_out_var_node = scale_out; } else if (bn) { auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); PADDLE_ENFORCE_EQ( @@ -921,7 +926,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("bn_out node ptr can not be null")); conv2d_xpu_out_name = bn_out->Name(); conv2d_out_var_node = bn_out; - conv2d_out_op_node = bn; } else if (ew_bias_add) { auto* ew_bias_add_out = GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); @@ -931,7 +935,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( "ew_bias_add_out node ptr can not be null")); conv2d_xpu_out_name = ew_bias_add_out->Name(); conv2d_out_var_node = ew_bias_add_out; - conv2d_out_op_node = ew_bias_add; } else { auto* conv_out = GetNodeFromNodesMap(nodes_map, "conv", "conv_out"); PADDLE_ENFORCE_EQ( @@ -945,7 +948,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( conv != nullptr, true, platform::errors::InvalidArgument("conv node ptr can not be null")); - conv2d_out_op_node = conv; } (*fusion_nodes_map)["out"] = conv2d_out_var_node; diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 4e8a6d9d99c73..93ad3aec0d16a 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -381,7 +381,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( } // Create fusion_bias_node auto filter_dims = filter_t->dims(); - bool has_bias = with_bn || with_bias; Node* fusion_bias_node = nullptr; if (with_bias) { auto* ew_bias_add_bias = @@ -390,8 +389,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( true, platform::errors::InvalidArgument( "ew_bias_add_bias node ptr can not be null")); - auto* ew_bias_add_bias_t = scope->FindVar(ew_bias_add_bias->Name()) - ->GetMutable(); PrepareBias(graph, scope, block, ew_bias_add_bias, &fusion_bias_node); } @@ -424,13 +421,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( auto bn_bias_t = scope->Var(bn_bias->Name())->GetMutable(); - PADDLE_ENFORCE_EQ( - filter_dims[0], - bn_bias_t->dims()[0], - platform::errors::InvalidArgument("the shape[%d] of bn bias tensor " - "must equal out_channel[%d] of conv", - bn_bias_t->dims()[0], - filter_dims[0])); auto bn_scale_t = scope->Var(bn_scale->Name())->GetMutable(); auto bn_mean_t = @@ -582,7 +572,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("mul node ptr can not be null")); // output && output max std::string fc_xpu_out_name; - Node* fc_out_op_node = nullptr; Node* fc_out_var_node = nullptr; auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); @@ -597,7 +586,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("act_out node ptr can not be null")); fc_xpu_out_name = act_out->Name(); fc_out_var_node = act_out; - fc_out_op_node = act; } else if (bn) { auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); PADDLE_ENFORCE_EQ( @@ -606,7 +594,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("bn_out node ptr can not be null")); fc_xpu_out_name = 
bn_out->Name(); fc_out_var_node = bn_out; - fc_out_op_node = bn; } else if (ew_bias_add) { auto* ew_bias_add_out = GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); @@ -616,7 +603,6 @@ void FcXPUFusePass::CreateFusionOutputs( "ew_bias_add_out node ptr can not be null")); fc_xpu_out_name = ew_bias_add_out->Name(); fc_out_var_node = ew_bias_add_out; - fc_out_op_node = ew_bias_add; } else { auto* mul_out = GetNodeFromNodesMap(nodes_map, "mul", "mul_out"); PADDLE_ENFORCE_EQ( @@ -625,7 +611,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("mul_out node ptr can not be null")); fc_xpu_out_name = mul_out->Name(); fc_out_var_node = mul_out; - fc_out_op_node = mul; } (*fusion_nodes_map)["out"] = fc_out_var_node; diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index bf03a2598726c..9b552bac36f2d 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -172,7 +172,6 @@ void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const { GraphPatternDetector gpd; patterns::LinkConv2dPattern pattern( gpd.mutable_pattern(), name_scope_, with_branch); - auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -232,7 +231,6 @@ void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const { GraphPatternDetector gpd; patterns::LinkFcPattern pattern(gpd.mutable_pattern(), name_scope_); int found_subgraph_count = 0; - auto* scope = param_scope(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle LinkFcMax"; diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index ebeb75763320e..865464dcd7dca 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 0e6fd9797c177..6161293bf7fb7 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -66,8 +66,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( int found_dequant_quant_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - LOG(INFO) << "squash dequantize-quantize ops pair"; - GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h index 2d3fbb94f140e..d3f37dd42010d 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h @@ -60,16 +60,6 @@ class XPUQuantizeSquashPass : public FusePassBase { */ void MultipleQuantizeSquash(Graph* graph) const; - /* - * Squash scale if dequantize is before scale - */ - void DequantScaleSquash(Graph* graph) const; - - /* - * Squash 
scale if scale is before quantize - */ - void ScaleQuantSquash(Graph* graph) const; - /* * Squash quantize if is before conv2d_xpu/fc_xpuy */ diff --git a/paddle/phi/kernels/xpu/dequantization_kernel.cc b/paddle/phi/kernels/xpu/dequantization_kernel.cc index 759a3fd020458..9dc9868e75fd9 100644 --- a/paddle/phi/kernels/xpu/dequantization_kernel.cc +++ b/paddle/phi/kernels/xpu/dequantization_kernel.cc @@ -56,7 +56,7 @@ void DeQuantizeKernel(const Context& ctx, break; default: PADDLE_THROW(phi::errors::Unavailable( - "Not supported Quantize data type from %d -> %d ", + "Not supported dequantize data type from %d -> %d ", x.dtype(), out_dtype)); } From 4fa68eba56e820e1cbebe7536bb41b1691032ba8 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 11:13:10 +0800 Subject: [PATCH 10/15] fix link quantize_helper.cc library wrong --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/ir/CMakeLists.txt | 9 +-- .../ir/delete_quant_dequant_linear_op_pass.cc | 2 +- .../delete_weight_dequant_linear_op_pass.cc | 2 +- .../framework/ir/quantize_pass_helper.cc | 79 ------------------- .../fluid/framework/ir/quantize_pass_helper.h | 49 ------------ .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 2 +- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 2 +- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 2 +- 9 files changed, 11 insertions(+), 140 deletions(-) delete mode 100644 paddle/fluid/framework/ir/quantize_pass_helper.cc delete mode 100644 paddle/fluid/framework/ir/quantize_pass_helper.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b83568cfdd69a..e13025182ed9d 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -55,13 +55,13 @@ function(pass_library TARGET DEST) ${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry - ${pass_library_DEPS}) + quantize_helper ${pass_library_DEPS}) else() cc_library( ${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry - ${pass_library_DEPS}) + quantize_helper ${pass_library_DEPS}) endif() # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. 
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 47e7a9948856c..305a11805c9b0 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -60,9 +60,9 @@ cc_library( SRCS placement_pass_base.cc DEPS pass) cc_library( - quantize_pass_helper - SRCS quantize_pass_helper.cc - DEPS pass graph graph_helper) + quantize_helper + SRCS quantize_helper.cc + DEPS graph graph_helper) cc_library( coalesce_grad_tensor_pass @@ -245,8 +245,7 @@ if(WITH_XPU) xpu_graph_pattern_detector SRCS xpu/xpu_graph_pattern_detector.cc DEPS graph_pattern_detector) - set(XPU_PASS_DEPS quantize_pass_helper xpu_quant_utils xpu_pass_utils - xpu_graph_pattern_detector) + set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils xpu_graph_pattern_detector) pass_library(cast_mixed_precision_op_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 025cd0c2b7ddd..916d577d23d60 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -19,7 +19,7 @@ #include #include #include -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index e30ae85f71c02..87f2de2a59e0d 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "glog/logging.h" diff --git a/paddle/fluid/framework/ir/quantize_pass_helper.cc b/paddle/fluid/framework/ir/quantize_pass_helper.cc deleted file mode 100644 index 730123682f58f..0000000000000 --- a/paddle/fluid/framework/ir/quantize_pass_helper.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SaveQuantInfoInTheGraph( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - const std::unordered_map>& info_map) { - const std::string suffix = "_" + key_suffix + "_" + flag; - if (!graph->Has(flag)) { - graph->Set(flag, new bool(true)); - } - for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { - graph->Set(iter->first + suffix, new std::vector(iter->second)); - } -} - -std::unordered_map> GetQuantInfoFromTheGraph( - ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { - std::unordered_map> info_map; - const std::string suffix = "_" + key_suffix + "_" + flag; - if (graph->Has(flag)) { - std::vector attr_names = graph->AttrNames(); - for (auto fake_name : attr_names) { - size_t pos = fake_name.find(suffix); - if (pos != std::string::npos) { - std::string name = fake_name.substr(0, pos); - auto scales_vector = graph->Get>(fake_name); - info_map.insert(std::make_pair(name, scales_vector)); - } - } - } - return info_map; -} - -bool AreScalesPresentForNodes( - std::unordered_map>* var_quant_scales, - std::initializer_list nodes) { - bool present = true; - for (auto node : nodes) { - if (var_quant_scales->count(node->Name()) == 0) { - present = false; - } - } - return present; -} - -float GetScaleValueForNode( - std::unordered_map>* var_quant_scales, - Node* node) { - return var_quant_scales->at(node->Name())[0]; -} - -std::vector GetScaleVecValueForNode( - std::unordered_map>* var_quant_scales, - Node* node) { - return var_quant_scales->at(node->Name()); -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/quantize_pass_helper.h b/paddle/fluid/framework/ir/quantize_pass_helper.h deleted file mode 100644 index 4876cd35a1cf3..0000000000000 --- a/paddle/fluid/framework/ir/quantize_pass_helper.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SaveQuantInfoInTheGraph( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - const std::unordered_map>& info_map); - -std::unordered_map> GetQuantInfoFromTheGraph( - ir::Graph* graph, const std::string& flag, const std::string& key_suffix); - -bool AreScalesPresentForNodes( - std::unordered_map>* var_quant_scales, - std::initializer_list nodes); - -float GetScaleValueForNode( - std::unordered_map>* var_quant_scales, - Node* node); - -std::vector GetScaleVecValueForNode( - std::unordered_map>* var_quant_scales, - Node* node); - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 09037a0fd60eb..f4484689d7994 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 93ad3aec0d16a..852bed2b20af0 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index 865464dcd7dca..a00879072c30b 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { From cdeec3964cd6299684b46fa2e7734fb31824a325 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 11:13:32 +0800 Subject: [PATCH 11/15] fix link quantize_helper.cc library wrong --- paddle/fluid/framework/ir/quantize_helper.cc | 79 ++++++++++++++++++++ paddle/fluid/framework/ir/quantize_helper.h | 49 ++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 paddle/fluid/framework/ir/quantize_helper.cc create mode 100644 paddle/fluid/framework/ir/quantize_helper.h diff --git a/paddle/fluid/framework/ir/quantize_helper.cc b/paddle/fluid/framework/ir/quantize_helper.cc new file mode 100644 index 0000000000000..08f2cc457ef2c --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_helper.cc @@ -0,0 +1,79 @@ +// Copyright (c) 
2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/quantize_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map) { + const std::string suffix = "_" + key_suffix + "_" + flag; + if (!graph->Has(flag)) { + graph->Set(flag, new bool(true)); + } + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + graph->Set(iter->first + suffix, new std::vector(iter->second)); + } +} + +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { + std::unordered_map> info_map; + const std::string suffix = "_" + key_suffix + "_" + flag; + if (graph->Has(flag)) { + std::vector attr_names = graph->AttrNames(); + for (auto fake_name : attr_names) { + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + auto scales_vector = graph->Get>(fake_name); + info_map.insert(std::make_pair(name, scales_vector)); + } + } + } + return info_map; +} + +bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes) { + bool present = true; + for (auto node : nodes) { + if (var_quant_scales->count(node->Name()) == 0) { + present = false; + } + } + return present; +} + +float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name())[0]; +} + +std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name()); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/quantize_helper.h b/paddle/fluid/framework/ir/quantize_helper.h new file mode 100644 index 0000000000000..4876cd35a1cf3 --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map); + +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix); + +bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes); + +float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +} // namespace ir +} // namespace framework +} // namespace paddle From d86c4ce5df96e957d4dca62b0c57493000c59923 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 14:58:05 +0800 Subject: [PATCH 12/15] static check fix --- paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc | 1 - paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 852bed2b20af0..2c516ba46851e 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -380,7 +380,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( weight_scale = GetScaleVecValueForNode(var_quant_scales, mul_w); } // Create fusion_bias_node - auto filter_dims = filter_t->dims(); Node* fusion_bias_node = nullptr; if (with_bias) { auto* ew_bias_add_bias = diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index a00879072c30b..a7db42f8ec951 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -49,7 +49,6 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, Node* op, Node* input, std::string input_arg_name) const { - auto* scope = param_scope(); auto inputs = op->Op()->InputNames(); bool name_found = std::find(inputs.begin(), inputs.end(), input_arg_name) != inputs.end(); @@ -92,7 +91,6 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, Node* op, Node* output, std::string output_arg_name) const { - auto* scope = param_scope(); auto outputs = op->Op()->OutputNames(); bool name_found = std::find(outputs.begin(), outputs.end(), output_arg_name) != @@ -166,7 +164,8 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { out_var_node = output_node; } } - if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + if (!AreScalesPresentForNodes(&var_quant_scales_, + {x_var_node, w_var_node})) { MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); return; } From 9a3c5392c03459b7687985227cb0678925bb29a0 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 15:30:47 +0800 Subject: [PATCH 13/15] remove use mutable_data func and use data func instead --- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 27 +++++++------------ .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 21 +++++---------- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 3 ++- 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index f4484689d7994..89a558c6601f1 100644 --- 
a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -577,14 +577,10 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( auto bn_mean_t = scope->Var(bn_mean->Name())->GetMutable(); auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); - float* bn_scale_ptr = - bn_scale_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_bias_ptr = - bn_bias_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_mean_ptr = - bn_mean_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_var_ptr = - bn_var_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_scale_ptr = bn_scale_t->data(); + float* bn_bias_ptr = bn_bias_t->data(); + float* bn_mean_ptr = bn_mean_t->data(); + float* bn_var_ptr = bn_var_t->data(); auto mean_len = bn_mean_t->numel(); auto filter_stride = filter_len / mean_len; float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); @@ -594,24 +590,21 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( auto fusion_bias_t = scope->Var(fusion_bias_node->Name())->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* fusion_bias_ptr = fusion_bias_t->data(); // recompute bias and weights for (int i = 0; i < mean_len; ++i) { bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights if (op_weights_precision != "int8") { - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + float* filter_ptr = filter_t->data(); for (int i = 0; i < mean_len; ++i) { for (int j = 0; j < filter_stride; j++) { filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; } } } else { - int8_t* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + int8_t* filter_ptr = filter_t->data(); PADDLE_ENFORCE_EQ( weight_scale.size(), mean_len, @@ -659,8 +652,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( // recompute bias as scale op auto fusion_bias_t = scope->GetVar(fusion_bias_node->Name())->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* fusion_bias_ptr = fusion_bias_t->data(); for (int i = 0; i < bias_len; ++i) { if (bias_after_scale_) { fusion_bias_ptr[i] = fusion_bias_ptr[i] * scale_val_ + bias_val_; @@ -670,8 +662,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // recompute weight as scale op if (op_weights_precision != "int8") { - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + float* filter_ptr = filter_t->data(); for (int i = 0; i < filter_len; ++i) { filter_ptr[i] *= scale_val_; } diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 2c516ba46851e..373275706700f 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -425,14 +425,10 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( auto bn_mean_t = scope->Var(bn_mean->Name())->GetMutable(); auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); - float* bn_scale_ptr = - bn_scale_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_bias_ptr = - bn_bias_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_mean_ptr = - bn_mean_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_var_ptr = - bn_var_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_scale_ptr = bn_scale_t->data(); + float* bn_bias_ptr = bn_bias_t->data(); + float* bn_mean_ptr = bn_mean_t->data(); + 
float* bn_var_ptr = bn_var_t->data(); auto mean_len = bn_mean_t->numel(); auto filter_stride = filter_len / mean_len; float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); @@ -442,24 +438,21 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( auto fusion_bias_t = scope->Var(fusion_bias_node->Name())->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* fusion_bias_ptr = fusion_bias_t->data(); // recompute bias and weights for (int i = 0; i < mean_len; ++i) { bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights if (op_weights_precision != "int8") { - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + float* filter_ptr = filter_t->data(); for (int i = 0; i < mean_len; ++i) { for (int j = 0; j < filter_stride; j++) { filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; } } } else { - int8_t* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + int8_t* filter_ptr = filter_t->data(); PADDLE_ENFORCE_EQ( weight_scale.size(), mean_len, diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index a7db42f8ec951..761f17a92e299 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -237,7 +237,8 @@ void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const { out_var_node = output_node; } } - if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + if (!AreScalesPresentForNodes(&var_quant_scales_, + {x_var_node, w_var_node})) { MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); return; } From 7c9255ec96b5ad9f5267c2ad3881274279d8925f Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 18:38:30 +0800 Subject: [PATCH 14/15] remove old prepare weight func --- .../ir/xpu/conv2d_transpose_xpu_fuse_pass.cc | 10 +- .../xpu/fused_multi_transformer_xpu_pass.cc | 20 +- .../ir/xpu/multi_encoder_xpu_fuse_pass.cc | 24 +- paddle/fluid/framework/ir/xpu/pass_utils.cc | 223 ++++++------------ paddle/fluid/framework/ir/xpu/pass_utils.h | 16 +- paddle/fluid/framework/ir/xpu/quant_utils.cc | 51 +--- paddle/fluid/framework/ir/xpu/quant_utils.h | 8 - .../inference/analysis/passes/CMakeLists.txt | 9 +- .../passes/convert_to_mixed_precision.cc | 1 - 9 files changed, 120 insertions(+), 242 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc index 784d5d4ec029f..51ebb63c563dc 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc @@ -377,8 +377,14 @@ int Conv2dTransposeXPUFusePass::ApplyImpl(ir::Graph* graph, // filter max Node* filter_int16 = nullptr; Node* filter_max = nullptr; - PrepareWeight( - graph, scope, block, conv_filter, &filter_int16, &filter_max, false); + PrepareWeight(graph, + scope, + block, + conv_filter, + &filter_int16, + &filter_max, + false, + std::vector({})); // output && output max std::string conv2d_xpu_out_name; if (!act_type.empty()) { diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc index 725f4e6a86a49..47bf2b06be9d9 100644 --- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc @@ 
-424,11 +424,23 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant( nullptr, platform::errors::Fatal("w node should not be nullptr")); if (quant_post_dynamic_weight_precision == 0) { - PrepareWeight( - graph, scope, block, w_node, &w_intx, &w_max, need_transpose); + PrepareWeight(graph, + scope, + block, + w_node, + &w_intx, + &w_max, + need_transpose, + std::vector({})); } else { - PrepareWeight( - graph, scope, block, w_node, &w_intx, &w_max, need_transpose); + PrepareWeight(graph, + scope, + block, + w_node, + &w_intx, + &w_max, + need_transpose, + std::vector({})); } w_nodes->push_back(w_node); w_intx_nodes->push_back(w_intx); diff --git a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc index 255c1f5d47a4c..04439608aaa23 100644 --- a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc @@ -561,7 +561,8 @@ void MultiEncoderXPUFusePass::PrepareQKVWeight(Graph* graph, &q_w_fp32_t, &k_w_fp32_t, &v_w_fp32_t}; phi::ConcatKernel(*cpu_ctx, in_tensors, 0, &qkv_w_int16_t); - PrepareWeight(&qkv_w_int16_t, &qkv_w_max_t, false); + ConvertWithQuant( + &qkv_w_int16_t, &qkv_w_max_t, false, std::vector({})); size_t qkv_w_int16_hash = HashTensor(qkv_w_int16_t); size_t qkv_w_max_hash = HashTensor(qkv_w_max_t); std::string qkv_w_int16_name = std::to_string(qkv_w_int16_hash); @@ -813,16 +814,17 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse( &qkv_w_int16, &qkv_w_max); -#define PREPARE_QKV_MATMUL_W(idx_) \ - Node* qkv_matmul_##idx_##_w_int16 = nullptr; \ - Node* qkv_matmul_##idx_##_w_max = nullptr; \ - PrepareWeight(graph, \ - scope, \ - block, \ - qkv_matmul_##idx_##_w, \ - &qkv_matmul_##idx_##_w_int16, \ - &qkv_matmul_##idx_##_w_max, \ - true); +#define PREPARE_QKV_MATMUL_W(idx_) \ + Node* qkv_matmul_##idx_##_w_int16 = nullptr; \ + Node* qkv_matmul_##idx_##_w_max = nullptr; \ + PrepareWeight(graph, \ + scope, \ + block, \ + qkv_matmul_##idx_##_w, \ + &qkv_matmul_##idx_##_w_int16, \ + &qkv_matmul_##idx_##_w_max, \ + true, \ + std::vector({})); PREPARE_QKV_MATMUL_W(1); PREPARE_QKV_MATMUL_W(2); PREPARE_QKV_MATMUL_W(3); diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.cc b/paddle/fluid/framework/ir/xpu/pass_utils.cc index b895033108e12..c6dc291315399 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.cc +++ b/paddle/fluid/framework/ir/xpu/pass_utils.cc @@ -133,79 +133,78 @@ void PrepareWeight(Graph* graph, Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales) { auto weight_name = weight->Name(); auto* weight_tensor = scope->Var(weight_name)->GetMutable(); - phi::DenseTensor quant_weight_tensor; - Assign(*weight_tensor, &quant_weight_tensor); - phi::DenseTensor quant_weight_max_tensor; + phi::DenseTensor dst_weight_tensor; + Assign(*weight_tensor, &dst_weight_tensor); + phi::DenseTensor dst_weight_max_tensor; ConvertWeightWrapper( - &quant_weight_tensor, &quant_weight_max_tensor, transpose, weight_scales); - size_t quant_weight_hash = HashTensor(quant_weight_tensor); - size_t quant_weight_max_hash = HashTensor(quant_weight_max_tensor); + &dst_weight_tensor, &dst_weight_max_tensor, transpose, weight_scales); + size_t dst_weight_hash = HashTensor(dst_weight_tensor); + size_t dst_weight_max_hash = HashTensor(dst_weight_max_tensor); std::string pre_name = GetPrefixWithoutHash(weight_name); - 
std::string quant_weight_name = - pre_name + "_#" + std::to_string(quant_weight_hash); - std::string quant_weight_max_name = - pre_name + "_max_#" + std::to_string(quant_weight_max_hash); - *quant_weight = FindNodeWithName(graph, quant_weight_name); - if (*quant_weight == nullptr) { - // Create quant_weight node - // Update quant_weight var_desc in block - VarDesc quant_weight_desc(quant_weight_name); - quant_weight_desc.SetPersistable(true); - quant_weight_desc.SetShape(vectorize(quant_weight_tensor.dims())); - quant_weight_desc.SetDataType( - framework::TransToProtoVarType(quant_weight_tensor.dtype())); - *quant_weight = graph->CreateVarNode(&quant_weight_desc); - auto* block_quant_weight_desc = block->Var(quant_weight_name); - block_quant_weight_desc->SetPersistable(quant_weight_desc.Persistable()); - block_quant_weight_desc->SetShape(quant_weight_desc.GetShape()); - block_quant_weight_desc->SetDataType(quant_weight_desc.GetDataType()); - // Create quant_weight_max node - // Update quant_weight_max var_desc in block - VarDesc quant_weight_max_desc(quant_weight_max_name); - quant_weight_max_desc.SetPersistable(true); - quant_weight_max_desc.SetShape(vectorize(quant_weight_max_tensor.dims())); - quant_weight_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - *quant_weight_max = graph->CreateVarNode(&quant_weight_max_desc); - auto* block_quant_weight_max_desc = block->Var(quant_weight_max_name); - block_quant_weight_max_desc->SetPersistable( - quant_weight_max_desc.Persistable()); - block_quant_weight_max_desc->SetShape(quant_weight_max_desc.GetShape()); - block_quant_weight_max_desc->SetDataType( - quant_weight_max_desc.GetDataType()); + std::string dst_weight_name = + pre_name + "_#" + std::to_string(dst_weight_hash); + std::string dst_weight_max_name = + pre_name + "_max_#" + std::to_string(dst_weight_max_hash); + *dst_weight = FindNodeWithName(graph, dst_weight_name); + if (*dst_weight == nullptr) { + // Create dst_weight node + // Update dst_weight var_desc in block + VarDesc dst_weight_desc(dst_weight_name); + dst_weight_desc.SetPersistable(true); + dst_weight_desc.SetShape(vectorize(dst_weight_tensor.dims())); + dst_weight_desc.SetDataType( + framework::TransToProtoVarType(dst_weight_tensor.dtype())); + *dst_weight = graph->CreateVarNode(&dst_weight_desc); + auto* block_dst_weight_desc = block->Var(dst_weight_name); + block_dst_weight_desc->SetPersistable(dst_weight_desc.Persistable()); + block_dst_weight_desc->SetShape(dst_weight_desc.GetShape()); + block_dst_weight_desc->SetDataType(dst_weight_desc.GetDataType()); + // Create dst_weight_max node + // Update dst_weight_max var_desc in block + VarDesc dst_weight_max_desc(dst_weight_max_name); + dst_weight_max_desc.SetPersistable(true); + dst_weight_max_desc.SetShape(vectorize(dst_weight_max_tensor.dims())); + dst_weight_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + *dst_weight_max = graph->CreateVarNode(&dst_weight_max_desc); + auto* block_dst_weight_max_desc = block->Var(dst_weight_max_name); + block_dst_weight_max_desc->SetPersistable( + dst_weight_max_desc.Persistable()); + block_dst_weight_max_desc->SetShape(dst_weight_max_desc.GetShape()); + block_dst_weight_max_desc->SetDataType(dst_weight_max_desc.GetDataType()); // Find dst/dst_max variable in scope - auto* quant_weight_var = scope->FindVar(quant_weight_name); - if (quant_weight_var == nullptr) { - // Create quant_weight/quant_weight_max variable/tensor - Assign(quant_weight_tensor, - scope->Var(quant_weight_name)->GetMutable()); - 
Assign(quant_weight_max_tensor, - scope->Var(quant_weight_max_name)->GetMutable()); + auto* dst_weight_var = scope->FindVar(dst_weight_name); + if (dst_weight_var == nullptr) { + // Create dst_weight/dst_weight_max variable/tensor + Assign(dst_weight_tensor, + scope->Var(dst_weight_name)->GetMutable()); + Assign(dst_weight_max_tensor, + scope->Var(dst_weight_max_name)->GetMutable()); } else { // Share the same variable PADDLE_ENFORCE_NOT_NULL( - scope->FindVar(quant_weight_max_name), - platform::errors::Fatal("quant_weight_max(%s) variable should not be " - "nullptr if quant_weight(%s) " + scope->FindVar(dst_weight_max_name), + platform::errors::Fatal("dst_weight_max(%s) variable should not be " + "nullptr if dst_weight(%s) " "variable is exist. (weight_name is %s)", - quant_weight_max_name, - quant_weight_name, + dst_weight_max_name, + dst_weight_name, weight_name)); } } else { - *quant_weight_max = FindNodeWithName(graph, quant_weight_max_name); + *dst_weight_max = FindNodeWithName(graph, dst_weight_max_name); PADDLE_ENFORCE_NOT_NULL( - *quant_weight_max, - platform::errors::Fatal("quant_weight_max(%s) variable should not be " - "nullptr if quant_weight(%s) " + *dst_weight_max, + platform::errors::Fatal("dst_weight_max(%s) variable should not be " + "nullptr if dst_weight(%s) " "variable is exist. (weight_name is %s)", - quant_weight_max_name, - quant_weight_name, + dst_weight_max_name, + dst_weight_name, weight_name)); } } @@ -215,112 +214,30 @@ template void PrepareWeight( Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales); -template void PrepareWeight( +template void PrepareWeight( Graph* graph, Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales); -template -void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose) { - auto src_name = src->Name(); - auto* src_tensor = scope->Var(src_name)->GetMutable(); - - phi::DenseTensor dst_tensor; - Assign(*src_tensor, &dst_tensor); - phi::DenseTensor dst_max_tensor; - PrepareWeight(&dst_tensor, &dst_max_tensor, transpose); - - size_t dst_hash = HashTensor(dst_tensor); - size_t dst_max_hash = HashTensor(dst_max_tensor); - std::string pre_name = GetPrefixWithoutHash(src_name); - std::string dst_name = pre_name + "_#" + std::to_string(dst_hash); - std::string dst_max_name = pre_name + "_max_#" + std::to_string(dst_max_hash); - *dst = FindNodeWithName(graph, dst_name); - if (*dst == nullptr) { - // Create dst node - // Update dst var_desc in block - VarDesc dst_desc(dst_name); - dst_desc.SetPersistable(true); - dst_desc.SetShape(vectorize(dst_tensor.dims())); - dst_desc.SetDataType(framework::TransToProtoVarType(dst_tensor.dtype())); - *dst = graph->CreateVarNode(&dst_desc); - auto* block_dst_desc = block->Var(dst_name); - block_dst_desc->SetPersistable(dst_desc.Persistable()); - block_dst_desc->SetShape(dst_desc.GetShape()); - block_dst_desc->SetDataType(dst_desc.GetDataType()); - // Create dst_max node - // Update dst_max var_desc in block - VarDesc dst_max_desc(dst_max_name); - dst_max_desc.SetPersistable(true); - dst_max_desc.SetShape(vectorize(dst_max_tensor.dims())); - dst_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - *dst_max = graph->CreateVarNode(&dst_max_desc); - auto* 
block_dst_max_desc = block->Var(dst_max_name); - block_dst_max_desc->SetPersistable(dst_max_desc.Persistable()); - block_dst_max_desc->SetShape(dst_max_desc.GetShape()); - block_dst_max_desc->SetDataType(dst_max_desc.GetDataType()); - - // Find dst/dst_max variable in scope - auto* dst_var = scope->FindVar(dst_name); - if (dst_var == nullptr) { - // Create dst/dst_max variable/tensor - Assign(dst_tensor, scope->Var(dst_name)->GetMutable()); - Assign(dst_max_tensor, - scope->Var(dst_max_name)->GetMutable()); - } else { - // Share the same variable - PADDLE_ENFORCE_NOT_NULL( - scope->FindVar(dst_max_name), - platform::errors::Fatal( - "dst_max(%s) variable should not be nullptr if dst(%s) " - "variable is exist. (src_name is %s)", - dst_max_name, - dst_name, - src_name)); - } - } else { - *dst_max = FindNodeWithName(graph, dst_max_name); - PADDLE_ENFORCE_NOT_NULL( - *dst_max, - platform::errors::Fatal( - "dst_max(%s) variable should not be nullptr if dst(%s) " - "variable is exist. (src_name is %s)", - dst_max_name, - dst_name, - src_name)); - } -} - -template void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose); -template void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose); +template void PrepareWeight( + Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** dst_weight, + Node** dst_weight_max, + bool transpose, + const std::vector& weight_scales); void PrepareBias( Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst) { diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.h b/paddle/fluid/framework/ir/xpu/pass_utils.h index 417ba361e4348..668519c8eb406 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.h +++ b/paddle/fluid/framework/ir/xpu/pass_utils.h @@ -79,22 +79,16 @@ void ConvertWeightWrapper(phi::DenseTensor* weight, ConvertWithoutQuant(weight, weight_max, transpose, weight_scales); } -template -void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose); - +// 1. Quant weight from fp32 to int16/int31/int8 +// 2. Weight data is in-place update. +// 3. 
Generate weight max tensor template void PrepareWeight(Graph* graph, Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales); diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index 08c1da2148687..a137a006e9f70 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -281,11 +281,6 @@ void ConvertWithQuant(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose, const std::vector& weight_scales) { - if (!weight_scales.empty()) { - LOG(FATAL) << "Weight scales should be empty(), otherwise, check if your " - "model is quant model or not."; - } - // Convert fp16 to fp32 phi::DenseTensor weight_fp32; CastToFp32(weight, &weight_fp32); @@ -349,51 +344,17 @@ template void ConvertWithQuant( bool transpose, const std::vector& weight_scales); -template void ConvertWithoutQuant( +template void ConvertWithQuant( phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose, const std::vector& weight_scales); -template -void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose) { - // Convert fp16 to fp32 - phi::DenseTensor weight_fp32; - CastToFp32(weight, &weight_fp32); - - // Transpose - if (transpose) { - Transpose2D(&weight_fp32); - } - - // Find max - int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); - int size = weight_fp32.numel(); - auto* weight_data = weight_fp32.data(); - float max_val = FindMaxAbs(weight_data, size); - std::vector max_vec(max_ptr_size, max_val); - weight_max->set_type(phi::DataType::FLOAT32); - weight_max->Resize({max_ptr_size}); - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - memcpy(cpu_ctx->Alloc(weight_max), - max_vec.data(), - max_ptr_size * sizeof(float)); - - // Quant - weight->set_type(phi::CppTypeToDataType::Type()); - weight->Resize(weight_fp32.dims()); - QuantFP32ToIntX(weight_data, cpu_ctx->Alloc(weight), max_val, size); -} - -template void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose); -template void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose); +template void ConvertWithoutQuant( + phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); bool IsPerTensorQuant(const std::vector& weight_max) { bool per_tensor = true; diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h index b564bcac7202d..1a2952c614542 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.h +++ b/paddle/fluid/framework/ir/xpu/quant_utils.h @@ -51,14 +51,6 @@ void ConvertWithQuant(phi::DenseTensor* weight, bool transpose, const std::vector& weight_scales); -// 1. Quant weight from fp32 to int16/int31 -// 2. Weight data is in-place update. -// 3. 
Generate weight max tensor -template -void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose); - bool IsPerTensorQuant(const std::vector& weight_max); } // namespace ir diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 2561e14d06d1e..0af6876faca05 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -13,13 +13,8 @@ cc_library( cc_library( convert_to_mixed_precision SRCS convert_to_mixed_precision.cc - DEPS analysis_pass - ir_graph_build_pass - auto_mixed_precision_pass - constant_folding_pass - identity_op_clean_pass - delete_quant_dequant_linear_op_pass - delete_weight_dequant_linear_op_pass) + DEPS analysis_pass ir_graph_build_pass auto_mixed_precision_pass + constant_folding_pass identity_op_clean_pass) cc_library( ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 3aeeff498a52f..d706113307009 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -95,7 +95,6 @@ void ConvertToMixedPrecisionPass::Run() { framework::ir::AutoMixedPrecisionPass auto_mixed_precision_pass; auto_mixed_precision_pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); - if (backend_ == phi::Backend::GPU) { auto_mixed_precision_pass.Set("enable_gpu_mixed", new bool{true}); } else if (backend_ == phi::Backend::XPU) { From 426c36b117636f1e9ca05139dabeccbfd4628dcb Mon Sep 17 00:00:00 2001 From: csy0225 Date: Wed, 25 Oct 2023 15:15:30 +0800 Subject: [PATCH 15/15] move dequantize/quantize ops yaml pos --- paddle/phi/api/yaml/fused_ops.yaml | 18 ++++++++++++++++++ paddle/phi/api/yaml/ops.yaml | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index cf1cc6ebcd295..a86aac572c263 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -71,6 +71,15 @@ data_type : x optional : bias, branch, branch_max ,x_max, scale_max, out_max_in +- op : dequantize_xpu + args : (Tensor x, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : DeQuantizeXPUInferMeta + kernel : + func : dequantize_xpu + data_type: x + - op : embedding_with_eltwise_add_xpu args : (Tensor[] ids, Tensor[] tables, Tensor mask, int64_t padding_idx) output: Tensor(out), Tensor(seq_lod), Tensor(max_seq_len) @@ -254,6 +263,15 @@ data_type : input optional : bias_qk +- op : quantize_xpu + args : (Tensor x, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : QuantizeXPUInferMeta + kernel : + func : quantize_xpu + data_type : x + - op : squeeze_excitation_block args : (Tensor x, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] act_type, float[] act_param, int[] filter_dims) output : Tensor(out) diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 33b00e58ce841..aaf6c4e1445ef 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -657,15 +657,6 @@ func : depthwise_conv2d backward : depthwise_conv2d_grad -- op : dequantize_xpu - args : (Tensor x, DataType out_dtype, float scale = 1.0f) - output : Tensor(y) 
- infer_meta : - func : DeQuantizeXPUInferMeta - kernel : - func : dequantize_xpu - data_type: x - - op : det args : (Tensor x) output : Tensor @@ -2048,15 +2039,6 @@ func : qr backward : qr_grad -- op : quantize_xpu - args : (Tensor x, DataType out_dtype, float scale = 1.0f) - output : Tensor(y) - infer_meta : - func : QuantizeXPUInferMeta - kernel : - func : quantize_xpu - data_type : x - - op : real args : (Tensor x) output : Tensor (out)
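
For reference, a minimal sketch of how a fuse pass calls the reworked PrepareWeight helper after this series. The argument order follows the declaration added to pass_utils.h above; the template arguments, the float element type of weight_scales, and the local names (graph, scope, block, w_node, w_int16, ...) are assumptions for illustration, since this rendering of the patch drops the text between angle brackets and the real call sites live in the individual passes.

// Non-quantized (fp32/fp16) model: no scales are available, so an empty
// vector is passed and the helper is expected to quantize the fp32 weight
// itself (ConvertWithQuant) while producing the per-tensor max tensor.
Node* w_int16 = nullptr;
Node* w_int16_max = nullptr;
PrepareWeight</*Tcpu=*/float, /*Txpu=*/int16_t>(graph,
                                                scope,
                                                block,
                                                w_node,
                                                &w_int16,
                                                &w_int16_max,
                                                /*transpose=*/true,
                                                std::vector<float>({}));

// Offline-quantized int8 model: the weight tensor already holds integer data
// and the collected scales are forwarded, so the helper only needs the
// ConvertWithoutQuant path to build the weight max tensor.
Node* w_int8 = nullptr;
Node* w_int8_max = nullptr;
PrepareWeight</*Tcpu=*/int8_t, /*Txpu=*/int8_t>(graph,
                                                scope,
                                                block,
                                                w_node,
                                                &w_int8,
                                                &w_int8_max,
                                                /*transpose=*/false,
                                                weight_scales);

Both variants hash the converted tensor to derive the new node name, so passes that prepare the same weight more than once end up sharing a single persistable weight node and max-tensor node.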
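
The quantize_xpu / dequantize_xpu entries moved into fused_ops.yaml follow the usual yaml-to-phi mapping: the yaml args become InferMeta and kernel parameters in order, the output is appended last, and data_type : x keys kernel dtype dispatch to the input tensor. A plausible shape of the referenced InferMeta functions is sketched below; the signatures and bodies are assumptions for illustration, not code taken from this patch.

// Assumed mapping of the yaml args onto InferMeta parameters; the output
// keeps the input shape and takes the requested out_dtype.
void QuantizeXPUInferMeta(const MetaTensor& x,
                          DataType out_dtype,
                          float scale,
                          MetaTensor* y) {
  y->set_dims(x.dims());    // quantization preserves the shape
  y->set_dtype(out_dtype);  // integer dtype chosen by the calling pass
}

void DeQuantizeXPUInferMeta(const MetaTensor& x,
                            DataType out_dtype,
                            float scale,
                            MetaTensor* y) {
  y->set_dims(x.dims());    // dequantization preserves the shape
  y->set_dtype(out_dtype);  // floating-point dtype chosen by the calling pass
}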