From 26e125de993545807af63c56a307a959eab9c6df Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 19 Sep 2023 17:07:35 +0800 Subject: [PATCH 01/15] support int8 --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../framework/ir/auto_mixed_precision_pass.cc | 21 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 6 +- .../delete_weight_dequant_linear_op_pass.cc | 36 +- .../auto_trans_quantize_op_precision_pass.cc | 130 +++ .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 950 ++++++++++++++---- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 676 ++++++++++--- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 17 +- paddle/fluid/framework/ir/xpu/pass_utils.cc | 104 ++ paddle/fluid/framework/ir/xpu/pass_utils.h | 32 + paddle/fluid/framework/ir/xpu/quant_utils.cc | 146 +++ paddle/fluid/framework/ir/xpu/quant_utils.h | 28 + .../inference/analysis/passes/CMakeLists.txt | 9 +- .../passes/convert_to_mixed_precision.cc | 12 + .../inference/api/paddle_pass_builder.cc | 3 + paddle/phi/api/yaml/fused_ops.yaml | 8 +- paddle/phi/backends/xpu/xpu2_op_list.cc | 8 +- paddle/phi/infermeta/fusion.cc | 4 + paddle/phi/infermeta/fusion.h | 4 + .../kernels/fusion/xpu/conv2d_xpu_kernel.cc | 427 +++++++- .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 137 ++- 21 files changed, 2390 insertions(+), 370 deletions(-) create mode 100644 paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e67dfa5adf910..e9a8e4cc22cac 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -247,6 +247,8 @@ if(WITH_XPU) # pass_library(conv1d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_bias_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(auto_trans_quantize_op_precision_pass inference DIR xpu DEPS + ${XPU_PASS_DEPS}) pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_squeeze_unsqueeze_elimination_pass inference DIR xpu diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 698de5d90c256..497dcae8395d5 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -734,6 +734,11 @@ bool AutoMixedPrecisionPass::OutputVarsNotConvert( } void AutoMixedPrecisionPass::SetVarPrecision() const { + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL(scope, + platform::errors::PreconditionNotMet( + "During the auto_mixed_precision_pass, the scope " + "should not be null.")); for (const auto& nodes : all_op_nodes_) { for (auto* op_node : nodes) { if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) { @@ -750,7 +755,21 @@ void AutoMixedPrecisionPass::SetVarPrecision() const { if (!IsFP32AndFP64(real_in_var_node->Var()->GetDataType())) continue; if (!VarNodeHasDtype(real_in_var_node)) continue; if (InputVarsNotConvert(op_node, in_var_name)) continue; - + // Judge the real tensor is same to variable, Paddle-Slim weight use + // fp32 variable to save int8 tensor. 
+ if (real_in_var_node->Var()->Persistable()) { + auto* tensor = scope->Var(real_in_var_node->Name()) + ->GetMutable(); + if (framework::TransToProtoVarType(tensor->type()) != + real_in_var_node->Var()->GetDataType()) { + VLOG(1) << "[AutoMixedPrecisionPass] variable " + << real_in_var_node->Name() << "'s proto data type " + << real_in_var_node->Var()->GetDataType() + << " is different from real dense tensor " + << framework::TransToProtoVarType(tensor->type()); + continue; + } + } if (real_in_var_node->Var()->Persistable()) { real_in_var_node->Var()->SetDataType( framework::TransToProtoVarType(low_precision_)); diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index cb6a6e1d5d9dc..42c7f7acdc103 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -137,11 +137,15 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { int nums_any_ops = static_cast(dequantize_linear_op_out->outputs.size()); + int bit_length = + PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length")); for (int i = 0; i < nums_any_ops; ++i) { auto* any_op_desc = dequantize_linear_op_out->outputs[i]->Op(); any_op_desc->SetAttr("Input_scale_" + quantize_linear_op_x->Var()->Name(), input_scale); - + any_op_desc->SetAttr( + "Input_bit_length_" + quantize_linear_op_x->Var()->Name(), + bit_length); // link x to any_op2 any_op_desc->RenameInput(dequantize_linear_op_out->Var()->Name(), quantize_linear_op_x->Var()->Name()); diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 0b09d1b30f40a..0140fb664b1de 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -35,7 +35,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { true, platform::errors::InvalidArgument( "Graph must have kParamScopeAttr attribute.")); - + VLOG(1) << "Handle delete weight dequant linear op pass ..."; auto& scope = graph->Get(kParamScopeAttr); bool is_int8 = false; @@ -44,7 +44,9 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); + VLOG(1) << "Dequantize linear op Type: " << op->Type(); if (op->Type() == "dequantize_linear") { + VLOG(1) << "Dequantize linear op is come in: " << op->Type(); Node *weight_var_node, *calcu_op_node, *while_op_node; Node *dequantized_weight_var_node = nullptr, *scale_var_node = nullptr; // 1. Judge whether for dequant weight and find @@ -110,6 +112,8 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_tensor->dtype())); } + int bit_length = + PADDLE_GET_CONST(int, op->GetAttr("bit_length")); int quant_axis = PADDLE_GET_CONST(int, op->GetAttr("quant_axis")); if (quant_axis == -1) { // per_layer quant_dequant: all OP @@ -124,14 +128,36 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { calcu_op_desc->SetAttr("weight_scale", weight_scale[0]); } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Delete Weight Dequant Linear Op Pass is not supported " - "for " - "per-channel quantization")); + std::vector weights_shape = + weight_var_node->Var()->GetShape(); + quant_axis = quant_axis >= 0 + ? 
quant_axis + : quant_axis + weights_shape.size(); + PADDLE_ENFORCE_EQ( + weight_scale_nums, + weights_shape[quant_axis], + platform::errors::InvalidArgument( + "When quant_axis != -1, it means using per_channel " + "dequantization. In this situation, the number of " + "weight_scale should be equal with " + "weights_shape[quant_axis=%d]=%ld , but received " + "%d.", + quant_axis, + weights_shape[quant_axis], + weight_scale_nums)); + calcu_op_desc->SetAttr("weight_scale", weight_scale); } + calcu_op_desc->SetAttr("weight_quant_axis", quant_axis); + calcu_op_desc->SetAttr("weight_bit_length", bit_length); + calcu_op_desc->SetAttr("enable_int8", true); + VLOG(1) << "dequantized_weight_var_node->Var()->Name():" + << dequantized_weight_var_node->Var()->Name(); + VLOG(1) << "weight_var_node->Var()->Name(): " + << weight_var_node->Var()->Name(); calcu_op_desc->RenameInput( dequantized_weight_var_node->Var()->Name(), weight_var_node->Var()->Name()); + calcu_op_desc->Flush(); } } } diff --git a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc new file mode 100644 index 0000000000000..c8b4b7c040f7e --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc @@ -0,0 +1,130 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+
+#include 
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class AutoTransQuantizeOpPrecisionPass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+  void FirstRound(ir::Graph* graph) const;
+
+  const std::string name_scope_{"auto_trans_quantize_op_precision_pass"};
+  const std::unordered_set<std::string> support_fusion_quant_op_type_{
+      "conv2d_xpu"};
+};
+
+static inline Node* GetOpOutVarNodeByArgsName(ir::Graph* graph,
+                                              Node* op_node,
+                                              const std::string& arg_name) {
+  CHECK_EQ(op_node->IsOp(), true);
+  auto* op_desc = op_node->Op();
+  auto out_var_nodes = op_desc->Output(arg_name);
+  CHECK_EQ(out_var_nodes.size(), 1UL);
+  auto out_var_name = out_var_nodes[0];
+  auto out_var_node = FindNodeWithName(graph, out_var_name);
+  return out_var_node;
+}
+
+void AutoTransQuantizeOpPrecisionPass::FirstRound(ir::Graph* graph) const {
+  auto graph_size = graph->SubGraphsSize();
+  VLOG(1) << "There are " << graph_size << " subgraphs to be handled.";
+  for (size_t i = 0; i < graph_size; i++) {
+    auto subgraph = graph->GetSubGraph(i);
+    VLOG(1) << "Handling the subgraph id: " << i;
+    for (auto* op_node : TopologySortOperations(*subgraph)) {
+      auto op_type = op_node->Op()->Type();
+      if (support_fusion_quant_op_type_.find(op_type) !=
+          support_fusion_quant_op_type_.end()) {
+        bool enable_int8 = op_node->Op()->GetAttrIfExists<bool>("enable_int8");
+        int out_dtype = op_node->Op()->GetAttrIfExists<int>("out_dtype");
+        if (enable_int8) {
+          if (op_type == "conv2d_xpu") {
+            auto* out_var_node =
+                GetOpOutVarNodeByArgsName(subgraph, op_node, "out");
+            PADDLE_ENFORCE_NOT_NULL(
+                out_var_node,
+                platform::errors::InvalidArgument(
+                    "out_var_node in graph cannot be nullptr."));
+            bool is_int8_out = true;
+            for (auto* next_op_node : out_var_node->outputs) {
+              auto next_op_type = next_op_node->Op()->Type();
+              bool is_next_op_support_int8 =
+                  next_op_node->Op()->GetAttrIfExists<bool>("enable_int8") &&
+                  ((support_fusion_quant_op_type_.find(next_op_type) !=
+                    support_fusion_quant_op_type_.end()));
+              if (!is_next_op_support_int8) {
+                is_int8_out = false;
+                break;
+              }
+            }
+            if (is_int8_out) {
+              op_node->Op()->SetAttr(
+                  "out_dtype",
+                  static_cast<int>(proto::VarType::Type::VarType_Type_INT8));
+              out_var_node->Var()->SetDataType(
+                  proto::VarType::Type::VarType_Type_INT8);
+              VLOG(1) << "The out var node " << out_var_node->Name()
+                      << " is INT8";
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void AutoTransQuantizeOpPrecisionPass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, platform::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+  VLOG(1) << "AutoTransQuantizeOpPrecisionPass handling start ...";
+  FirstRound(graph);
+  VLOG(1) << "AutoTransQuantizeOpPrecisionPass handling end.";
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(auto_trans_quantize_op_precision_pass,
+              paddle::framework::ir::AutoTransQuantizeOpPrecisionPass);
+
+REGISTER_PASS_CAPABILITY(auto_trans_quantize_op_precision_pass)
+    
.AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc_xpu", 0) + .EQ("conv2d_xpu", 0)); diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 502c275a419d3..19e006d535409 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include #include "glog/logging.h" @@ -355,6 +356,49 @@ class Conv2dXPUFusePass : public FusePassBase { bool with_branch_x, bool with_branch_y) const; + Node* GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const; + + void CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_conv_bias, + bool with_bn, + bool with_scale, + bool enable_int8) const; + + void CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + void CreateFusionBranch( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + void CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8, + std::string act_type) const; + + const std::unordered_set support_quant_op_type_{"conv2d", + "conv2d_xpu"}; const std::string name_scope_{"conv2d_xpu_fuse_pass"}; }; @@ -401,6 +445,535 @@ void Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_subgraph_count); } +Node* Conv2dXPUFusePass::GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const { + auto iter = nodes_map.find(pattern_node_name); + PADDLE_ENFORCE_EQ( + iter != nodes_map.end(), + true, + platform::errors::InvalidArgument("nodes_map[%s] not found in nodes_map", + pattern_node_name.c_str())); + auto node_map = iter->second; + auto node_iter = node_map.find(node_name); + PADDLE_ENFORCE_EQ(node_iter != node_map.end(), + true, + platform::errors::InvalidArgument( + "nodes_map[%s][%s] not found in nodes_map", + pattern_node_name.c_str(), + node_name.c_str())); + return node_iter->second; +} + +void Conv2dXPUFusePass::CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_conv_bias, + bool with_bn, + bool with_scale, + bool enable_int8) const { + // Get Node + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + auto* conv_filter = GetNodeFromNodesMap(nodes_map, "conv", "conv_filter"); + PADDLE_ENFORCE_EQ(conv_filter != nullptr, + true, + platform::errors::InvalidArgument( + "conv_filter node ptr can not be null")); + + // transfilter fp16 --> fp32 + auto* filter_t = + scope->FindVar(conv_filter->Name())->GetMutable(); + auto filter_len = filter_t->numel(); + auto filter_dtype = filter_t->dtype(); + if (filter_dtype == phi::DataType::FLOAT16) { + CastToFp32(filter_t, nullptr); + } + + // Get Weight scale in int8 scene + std::vector weight_scale = + 
conv->Op()->GetAttrIfExists>("weight_scale"); + // Create fusion_bias_node + auto filter_dims = filter_t->dims(); + bool has_bias = with_bn || with_conv_bias; + Node* fusion_bias_node = nullptr; + if (with_conv_bias) { + auto* ew_bias_add_y = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_y"); + PADDLE_ENFORCE_EQ(ew_bias_add_y != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_y node ptr can not be null")); + auto* ew_bias_add_y_t = + scope->FindVar(ew_bias_add_y->Name())->GetMutable(); + auto ew_bias_add_y_dims = ew_bias_add_y_t->dims(); + PADDLE_ENFORCE_EQ(filter_dims[0], + ew_bias_add_y_dims[0], + platform::errors::InvalidArgument( + "the shape[%d] of elewise bias tensor " + "must equal out_channel[%d] of conv", + ew_bias_add_y_dims[0], + filter_dims[0])); + PrepareBias(graph, scope, block, ew_bias_add_y, &fusion_bias_node); + } + + if (with_bn) { + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + PADDLE_ENFORCE_EQ( + bn != nullptr, + true, + platform::errors::InvalidArgument("bn node ptr can not be null")); + auto* bn_bias = GetNodeFromNodesMap(nodes_map, "bn", "bn_bias"); + PADDLE_ENFORCE_EQ( + bn_bias != nullptr, + true, + platform::errors::InvalidArgument("bn_bias node ptr can not be null")); + auto* bn_scale = GetNodeFromNodesMap(nodes_map, "bn", "bn_scale"); + PADDLE_ENFORCE_EQ( + bn_scale != nullptr, + true, + platform::errors::InvalidArgument("bn_scale node ptr can not be null")); + auto* bn_var = GetNodeFromNodesMap(nodes_map, "bn", "bn_var"); + PADDLE_ENFORCE_EQ( + bn_var != nullptr, + true, + platform::errors::InvalidArgument("bn_var node ptr can not be null")); + auto* bn_mean = GetNodeFromNodesMap(nodes_map, "bn", "bn_mean"); + PADDLE_ENFORCE_EQ( + bn_mean != nullptr, + true, + platform::errors::InvalidArgument("bn_mean node ptr can not be null")); + + auto bn_bias_t = + scope->Var(bn_bias->Name())->GetMutable(); + PADDLE_ENFORCE_EQ( + filter_dims[0], + bn_bias_t->dims()[0], + platform::errors::InvalidArgument("the shape[%d] of bn bias tensor " + "must equal out_channel[%d] of conv", + bn_bias_t->dims()[0], + filter_dims[0])); + auto bn_scale_t = + scope->Var(bn_scale->Name())->GetMutable(); + auto bn_mean_t = + scope->Var(bn_mean->Name())->GetMutable(); + auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); + float* bn_scale_ptr = + bn_scale_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_bias_ptr = + bn_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_mean_ptr = + bn_mean_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_var_ptr = + bn_var_t->mutable_data(paddle::platform::CPUPlace()); + auto mean_len = bn_mean_t->numel(); + auto filter_stride = filter_len / mean_len; + float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); + if (!with_conv_bias) { // prev node is conv + PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); + } + + auto fusion_bias_t = + scope->Var(fusion_bias_node->Name())->GetMutable(); + float* fusion_bias_ptr = + fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + // recompute bias and weights + for (int i = 0; i < mean_len; ++i) { + bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); + } + // recompute the weights + if (!enable_int8) { + float* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; j++) { + filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; + } + } + } else { + int8_t* filter_ptr = + 
filter_t->mutable_data<int8_t>(paddle::platform::CPUPlace());
+      PADDLE_ENFORCE_EQ(
+          weight_scale.size(),
+          mean_len,
+          platform::errors::InvalidArgument(
+              "Weight max_scale size must equal batch_norm scale/mean size."));
+      for (int i = 0; i < mean_len; i++) {
+        weight_scale[i] *= fabs(bn_scale_ptr[i]);
+      }
+      for (int i = 0; i < mean_len; i++) {
+        if (bn_scale_ptr[i] < 0) {
+          for (int j = 0; j < filter_stride; ++j) {
+            filter_ptr[i * filter_stride + j] *= -1;
+          }
+        }
+      }
+    }
+    // recompute bias
+    if (!with_conv_bias) {
+      for (int i = 0; i < mean_len; ++i) {
+        fusion_bias_ptr[i] += (0.0f - bn_mean_ptr[i]) * bn_scale_ptr[i];
+      }
+    } else {
+      for (int i = 0; i < mean_len; ++i) {
+        fusion_bias_ptr[i] =
+            bn_bias_ptr[i] +
+            (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i];
+      }
+    }
+  }
+
+  // deal with scale op
+  if (with_scale && !enable_int8) {
+    auto* scale = GetNodeFromNodesMap(nodes_map, "scale", "scale");
+    PADDLE_ENFORCE_EQ(
+        scale != nullptr,
+        true,
+        platform::errors::InvalidArgument("scale node ptr can not be null"));
+    auto bias_len = filter_dims[0];
+    float scale_val_ = 1.f;
+    float bias_val_ = 0.f;
+    scale_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("scale"));
+    bias_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("bias"));
+    bool bias_after_scale_ =
+        PADDLE_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale"));
+    // recompute bias as scale op
+    auto fusion_bias_t = scope->GetVar(fusion_bias_node->Name())
+                             ->GetMutable<phi::DenseTensor>();
+    float* fusion_bias_ptr =
+        fusion_bias_t->mutable_data<float>(paddle::platform::CPUPlace());
+    for (int i = 0; i < bias_len; ++i) {
+      if (bias_after_scale_) {
+        fusion_bias_ptr[i] = fusion_bias_ptr[i] * scale_val_ + bias_val_;
+      } else {
+        fusion_bias_ptr[i] = (fusion_bias_ptr[i] + bias_val_) * scale_val_;
+      }
+    }
+    // recompute weight as scale op
+    float* filter_ptr =
+        filter_t->mutable_data<float>(paddle::platform::CPUPlace());
+    for (int i = 0; i < filter_len; ++i) {
+      filter_ptr[i] *= scale_val_;
+    }
+  }
+
+  (*fusion_nodes_map)["bias"] = fusion_bias_node;
+
+  Node* filter_intx = nullptr;
+  Node* filter_max = nullptr;
+  Node* scale_max = nullptr;
+  if (!enable_int8) {
+    PrepareWeight<int16_t>(graph,
+                           scope,
+                           block,
+                           conv_filter,
+                           &filter_intx,
+                           &filter_max,
+                           false,
+                           weight_scale);
+  } else {
+    PrepareWeight<int8_t>(graph,
+                          scope,
+                          block,
+                          conv_filter,
+                          &filter_intx,
+                          &filter_max,
+                          false,
+                          weight_scale);
+  }
+
+  bool is_per_channel_need_create_scale_max_node =
+      !weight_scale.empty() && !IsPerTensorQuant(weight_scale);
+  if (is_per_channel_need_create_scale_max_node) {
+    phi::DenseTensor ones_weight_max_tensor;
+    auto* cpu_ctx = static_cast<phi::CPUContext*>(
+        platform::DeviceContextPool::Instance().Get(phi::CPUPlace()));
+    int max_ptr_size = weight_scale.empty()
+                           ?
phi::backends::xpu::get_xpu_max_ptr_size(-1) + : weight_scale.size(); + ones_weight_max_tensor.set_type(phi::DataType::FLOAT32); + ones_weight_max_tensor.Resize({max_ptr_size}); + std::vector ones_weight(max_ptr_size, 1.0); + memcpy(cpu_ctx->Alloc(&ones_weight_max_tensor), + ones_weight.data(), + max_ptr_size * sizeof(float)); + + std::string scale_max_name = conv_filter->Name() + "_scale_max"; + VarDesc scale_max_desc(scale_max_name); + scale_max_desc.SetPersistable(true); + scale_max_desc.SetShape(vectorize(ones_weight_max_tensor.dims())); + scale_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + scale_max = graph->CreateVarNode(&scale_max_desc); + auto* block_scale_max_desc = block->Var(scale_max_name); + block_scale_max_desc->SetPersistable(scale_max_desc.Persistable()); + block_scale_max_desc->SetShape(scale_max_desc.GetShape()); + block_scale_max_desc->SetDataType(scale_max_desc.GetDataType()); + Assign(ones_weight_max_tensor, + scope->Var(scale_max_name)->GetMutable()); + } + + (*fusion_nodes_map)["filter"] = filter_intx; + if (is_per_channel_need_create_scale_max_node) { + (*fusion_nodes_map)["filter_max"] = scale_max; + (*fusion_nodes_map)["scale_max"] = filter_max; + } else { + (*fusion_nodes_map)["filter_max"] = filter_max; + (*fusion_nodes_map)["scale_max"] = scale_max; + } +} + +void Conv2dXPUFusePass::CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + // Get Node + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + auto* input = GetNodeFromNodesMap(nodes_map, "conv", "input"); + PADDLE_ENFORCE_EQ( + input != nullptr, + true, + platform::errors::InvalidArgument("conv input node ptr can not be null")); + // input max + std::string conv_input_max_name = input->Name() + "_input_max"; + Node* conv2d_xpu_input_max = nullptr; + if (enable_int8) { + float input_scale = + conv->Op()->GetAttrIfExists("Input_scale_" + input->Name()); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc conv_input_max_desc(conv_input_max_name); + conv_input_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + conv_input_max_desc.SetShape({static_cast(max_ptr_size)}); + conv_input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + conv2d_xpu_input_max = graph->CreateVarNode(&conv_input_max_desc); + auto input_max_tensor = + scope->Var(conv_input_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, input_scale); + memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + } + (*fusion_nodes_map)["x"] = input; + (*fusion_nodes_map)["x_max"] = conv2d_xpu_input_max; +} + +void Conv2dXPUFusePass::CreateFusionBranch( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + // Get Node + auto* ew_branch_add = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); + if (ew_branch_add) { + auto* ew_branch_add_in = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add_in"); + PADDLE_ENFORCE_EQ(ew_branch_add_in 
!= nullptr, + true, + platform::errors::InvalidArgument( + "ew_branch_add_in node ptr can not be null")); + (*fusion_nodes_map)["branch"] = ew_branch_add_in; + // ew_branch_add_max + std::string ew_branch_add_max_name = + ew_branch_add_in->Name() + "branch_max"; + Node* ew_branch_add_max = FindNodeWithName(graph, ew_branch_add_max_name); + if (enable_int8 && !ew_branch_add_max) { + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc ew_branch_add_in_max_desc(ew_branch_add_max_name); + ew_branch_add_in_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to + // xpu device + ew_branch_add_in_max_desc.SetShape({static_cast(max_ptr_size)}); + ew_branch_add_in_max_desc.SetDataType( + proto::VarType::Type::VarType_Type_FP32); + ew_branch_add_max = graph->CreateVarNode(&ew_branch_add_in_max_desc); + float ew_branch_add_scale = ew_branch_add->Op()->GetAttrIfExists( + "Input_scale_" + ew_branch_add_in->Name()); + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + conv->Op()->SetAttr("Input_scale_" + ew_branch_add_in->Name(), + ew_branch_add_scale); + auto ew_branch_add_max_tensor = + scope->Var(ew_branch_add_max_name)->GetMutable(); + ew_branch_add_max_tensor->set_type(phi::DataType::FLOAT32); + ew_branch_add_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector ew_branch_add_scales(max_ptr_size, + ew_branch_add_scale); + memcpy(cpu_ctx->Alloc(ew_branch_add_max_tensor), + ew_branch_add_scales.data(), + max_ptr_size * sizeof(float)); + } + (*fusion_nodes_map)["branch_max"] = ew_branch_add_max; + } +} + +void Conv2dXPUFusePass::CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8, + std::string act_type) const { + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + // output && output max + std::string conv2d_xpu_out_name; + Node* conv2d_out_op_node = nullptr; + Node* conv2d_out_var_node = nullptr; + + auto* ew_branch_add = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + auto* ew_bias_add = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add"); + if (!act_type.empty()) { + auto* act_out = GetNodeFromNodesMap(nodes_map, "act", "act_out"); + PADDLE_ENFORCE_EQ( + act_out != nullptr, + true, + platform::errors::InvalidArgument("act_out node ptr can not be null")); + conv2d_xpu_out_name = act_out->Name(); + conv2d_out_var_node = act_out; + auto* act = GetNodeFromNodesMap(nodes_map, "act", "act"); + PADDLE_ENFORCE_EQ( + act != nullptr, + true, + platform::errors::InvalidArgument("act node ptr can not be null")); + conv2d_out_op_node = act; + } else if (ew_branch_add) { + auto* ew_branch_add_out = + GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add_out"); + PADDLE_ENFORCE_EQ(ew_branch_add_out != nullptr, + true, + platform::errors::InvalidArgument( + "ew_branch_add_out node ptr can not be null")); + conv2d_xpu_out_name = ew_branch_add_out->Name(); + conv2d_out_var_node = ew_branch_add_out; + PADDLE_ENFORCE_EQ(ew_branch_add != nullptr, + true, + platform::errors::InvalidArgument( + "ew_branch_add 
node ptr can not be null")); + conv2d_out_op_node = ew_branch_add; + } else if (bn) { + auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); + PADDLE_ENFORCE_EQ( + bn_out != nullptr, + true, + platform::errors::InvalidArgument("bn_out node ptr can not be null")); + conv2d_xpu_out_name = bn_out->Name(); + conv2d_out_var_node = bn_out; + conv2d_out_op_node = bn; + } else if (ew_bias_add) { + auto* ew_bias_add_out = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); + PADDLE_ENFORCE_EQ(ew_bias_add_out != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_out node ptr can not be null")); + conv2d_xpu_out_name = ew_bias_add_out->Name(); + conv2d_out_var_node = ew_bias_add_out; + conv2d_out_op_node = ew_bias_add; + } else { + auto* conv_out = GetNodeFromNodesMap(nodes_map, "conv", "conv_out"); + PADDLE_ENFORCE_EQ( + conv_out != nullptr, + true, + platform::errors::InvalidArgument("conv_out node ptr can not be null")); + conv2d_xpu_out_name = conv_out->Name(); + conv2d_out_var_node = conv_out; + auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); + PADDLE_ENFORCE_EQ( + conv != nullptr, + true, + platform::errors::InvalidArgument("conv node ptr can not be null")); + conv2d_out_op_node = conv; + } + (*fusion_nodes_map)["out"] = conv2d_out_var_node; + + // Create out max in + if (enable_int8) { + std::string conv_out_max_in_name = conv2d_xpu_out_name + "_max_in"; + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc conv_out_max_in_desc(conv_out_max_in_name); + conv_out_max_in_desc.SetPersistable(true); + conv_out_max_in_desc.SetShape({static_cast(max_ptr_size)}); + conv_out_max_in_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* conv2d_xpu_out_max_in = graph->CreateVarNode(&conv_out_max_in_desc); + auto* block_out_max_in_desc = block->Var(conv_out_max_in_name); + block_out_max_in_desc->SetPersistable(conv_out_max_in_desc.Persistable()); + block_out_max_in_desc->SetShape(conv_out_max_in_desc.GetShape()); + block_out_max_in_desc->SetDataType(conv_out_max_in_desc.GetDataType()); + + auto GetOutputScale = [&](Node* var_node, std::string name) -> float { + int nums_any_ops = var_node->outputs.size(); + for (size_t i = 0; i < nums_any_ops; ++i) { + auto* any_op_desc = conv2d_out_var_node->outputs[i]->Op(); + VLOG(1) << "any_op_desc: " << any_op_desc->Type(); + if (any_op_desc->HasAttr("Input_scale_" + name)) { + VLOG(1) << "find it: " + << "Input_scale_" + name; + return any_op_desc->GetAttrIfExists("Input_scale_" + name); + } + } + return 0; + }; + float output_scale = + GetOutputScale(conv2d_out_var_node, conv2d_xpu_out_name); + conv->Op()->SetAttr("Input_scale_" + conv2d_xpu_out_name, output_scale); + VLOG(1) << "conv2d_xpu_out_name:" << conv2d_xpu_out_name + << " output_scale: " << output_scale + << "conv2d_out_var_node name:" << conv2d_out_var_node->Name(); + phi::DenseTensor out_max_in_cpu_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + out_max_in_cpu_tensor.set_type(phi::DataType::FLOAT32); + out_max_in_cpu_tensor.Resize({max_ptr_size}); + std::vector output_scales(max_ptr_size, output_scale); + memcpy(cpu_ctx->Alloc(&out_max_in_cpu_tensor), + output_scales.data(), + max_ptr_size * sizeof(float)); + Assign(out_max_in_cpu_tensor, + scope->Var(conv_out_max_in_name)->GetMutable()); + (*fusion_nodes_map)["out_max_in"] = conv2d_xpu_out_max_in; + } + + // Create out max + std::string conv_out_max_name = conv2d_xpu_out_name + "_max"; + VarDesc 
conv_out_max_desc(conv_out_max_name); + Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv_out_max_desc); + (*fusion_nodes_map)["out_max"] = conv2d_xpu_out_max; +} + int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, const std::string& conv_type, const std::string& act_type, @@ -419,18 +992,22 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, with_scale, with_branch_x, with_branch_y); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); + int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle Conv2dXPUFusePass fuse"; - /* declare operator node's name */ + std::map> nodes_map; GET_IR_NODE(conv); GET_IR_NODE(ew_bias_add); GET_IR_NODE(bn); GET_IR_NODE(scale); GET_IR_NODE(ew_branch_add); GET_IR_NODE(act); - /* declare variable node's name*/ + /* Get variable node's name*/ GET_IR_NODE(input); GET_IR_NODE(conv_filter); GET_IR_NODE(conv_out); @@ -449,167 +1026,140 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(ew_branch_add_in); GET_IR_NODE(ew_branch_add_out); GET_IR_NODE(act_out); + + nodes_map.insert({"conv", + {{"conv", conv}, + {"conv_filter", conv_filter}, + {"input", input}, + {"conv_out", conv_out}}}); + nodes_map.insert({"ew_bias_add", + {{"ew_bias_add", ew_bias_add}, + {"ew_bias_add_y", ew_bias_add_y}, + {"ew_bias_add_out", ew_bias_add_out}}}); + nodes_map.insert({"bn", + {{"bn", bn}, + {"bn_bias", bn_bias}, + {"bn_mean", bn_mean}, + {"bn_scale", bn_scale}, + {"bn_var", bn_var}, + {"bn_out", bn_out}, + {"bn_var_out", bn_var_out}, + {"bn_mean_out", bn_mean_out}, + {"bn_saved_var", bn_saved_var}, + {"bn_saved_mean", bn_saved_mean}}}); + nodes_map.insert({"scale", {{"scale", scale}, {"scale_out", scale_out}}}); + nodes_map.insert({"ew_branch_add", + {{"ew_branch_add", ew_branch_add}, + {"ew_branch_add_in", ew_branch_add_in}, + {"ew_branch_add_out", ew_branch_add_out}}}); + nodes_map.insert({"act", {{"act", act}, {"act_out", act_out}}}); + + std::map fusion_nodes_map{{"x", nullptr}, + {"x_max", nullptr}, + {"filter", nullptr}, + {"filter_max", nullptr}, + {"bias", nullptr}, + {"branch", nullptr}, + {"branch_max", nullptr}, + {"scale_max", nullptr}, + {"out_max_in", nullptr}, + {"out", nullptr}, + {"out_max", nullptr}}; + + bool enable_int8 = conv->Op()->GetAttrIfExists("enable_int8"); + std::string op_precision_str = enable_int8 ? 
"int8" : "fp32"; + VLOG(4) << "Conv2d fusion fuse pass is running on " << op_precision_str + << " precision!"; auto* block = conv->Op()->Block(); - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - // recompute bias and weight for conv2d_xpu op - auto* filter_t = - scope->FindVar(conv_filter->Name())->GetMutable(); - // conv_filter fp16 --> fp32 - auto filter_len = filter_t->numel(); - auto filter_dtype = filter_t->dtype(); - int out_dtype = proto::VarType::Type::VarType_Type_FP32; - if (filter_dtype == phi::DataType::FLOAT16) { - out_dtype = proto::VarType::Type::VarType_Type_FP16; - CastToFp32(filter_t, nullptr); - } - - auto filter_dims = filter_t->dims(); - bool has_bias = with_bn || with_conv_bias; - // Create conv_fusion_bias (conv bias) variable - Node* fusion_bias_node = nullptr; - if (has_bias) { - if (with_conv_bias) { - auto* ew_bias_add_y_t = scope->FindVar(ew_bias_add_y->Name()) - ->GetMutable(); - auto ew_bias_add_y_dims = ew_bias_add_y_t->dims(); - PADDLE_ENFORCE_EQ(filter_dims[0], - ew_bias_add_y_dims[0], - platform::errors::InvalidArgument( - "the shape[%d] of elewise bias tensor " - "must equal out_channel[%d] of conv", - ew_bias_add_y_dims[0], - filter_dims[0])); - PrepareBias(graph, scope, block, ew_bias_add_y, &fusion_bias_node); - } - if (with_bn) { - auto bn_bias_t = - scope->Var(bn_bias->Name())->GetMutable(); - PADDLE_ENFORCE_EQ(filter_dims[0], - bn_bias_t->dims()[0], - platform::errors::InvalidArgument( - "the shape[%d] of bn bias tensor " - "must equal out_channel[%d] of conv", - bn_bias_t->dims()[0], - filter_dims[0])); - auto bn_scale_t = - scope->Var(bn_scale->Name())->GetMutable(); - auto bn_mean_t = - scope->Var(bn_mean->Name())->GetMutable(); - auto bn_var_t = - scope->Var(bn_var->Name())->GetMutable(); - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_scale_ptr = - bn_scale_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_bias_ptr = - bn_bias_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_mean_ptr = - bn_mean_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_var_ptr = - bn_var_t->mutable_data(paddle::platform::CPUPlace()); - auto mean_len = bn_mean_t->numel(); - auto filter_stride = filter_len / mean_len; - float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); - if (!with_conv_bias) { // prev node is conv - PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); - } - auto fusion_bias_t = scope->Var(fusion_bias_node->Name()) - ->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); - // recompute bias and weights - if (!with_conv_bias) { // prev node is conv - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - fusion_bias_ptr[i] += (0.0f - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_stride; j++) { - filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; - } - } - } else { - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - fusion_bias_ptr[i] = - bn_bias_ptr[i] + - (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_stride; j++) { - filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; - } - } - } - } - } - // deal with scale op - if (with_scale) { - auto bias_len = filter_dims[0]; - float scale_val_ = 1.f; - float bias_val_ = 0.f; - scale_val_ = PADDLE_GET_CONST(float, 
scale->Op()->GetAttr("scale")); - bias_val_ = PADDLE_GET_CONST(float, scale->Op()->GetAttr("bias")); - bool bias_after_scale_ = - PADDLE_GET_CONST(bool, scale->Op()->GetAttr("bias_after_scale")); - // recompute bias as scale op - auto fusion_bias_t = scope->GetVar(fusion_bias_node->Name()) - ->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); - for (int i = 0; i < bias_len; ++i) { - if (bias_after_scale_) { - fusion_bias_ptr[i] = fusion_bias_ptr[i] * scale_val_ + bias_val_; - } else { - fusion_bias_ptr[i] = (fusion_bias_ptr[i] + bias_val_) * scale_val_; - } - } - // recompute weight as scale op - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); - for (int i = 0; i < filter_len; ++i) { - filter_ptr[i] *= scale_val_; - } - } - // filter max - Node* filter_int16 = nullptr; - Node* filter_max = nullptr; - PrepareWeight( - graph, scope, block, conv_filter, &filter_int16, &filter_max, false); - // output && output max - std::string conv2d_xpu_out_name; - if (!act_type.empty()) { - conv2d_xpu_out_name = act_out->Name(); - } else if (ew_branch_add) { - conv2d_xpu_out_name = ew_branch_add_out->Name(); - } else if (scale) { - conv2d_xpu_out_name = scale_out->Name(); - } else if (bn) { - conv2d_xpu_out_name = bn_out->Name(); - } else if (ew_bias_add) { - conv2d_xpu_out_name = ew_bias_add_out->Name(); - } else { - conv2d_xpu_out_name = conv_out->Name(); - } - std::string conv2d_xpu_out_max_name = conv2d_xpu_out_name + "_max"; - VarDesc conv2d_xpu_out_max_desc(conv2d_xpu_out_max_name); - Node* conv2d_xpu_out_max = graph->CreateVarNode(&conv2d_xpu_out_max_desc); + CreateFusionWeightsAndBias(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + with_conv_bias, + with_bn, + with_scale, + enable_int8); + VLOG(1) << "CreateFusionWeightsAndBias success!"; + CreateFusionInputs( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + VLOG(1) << "CreateFusionInputs success!"; + CreateFusionBranch( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + VLOG(1) << "CreateFusionBranch success!"; + CreateFusionOutputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + enable_int8, + act_type); + VLOG(1) << "CreateFusionOutputs success!"; + // int out_dtype = PADDLE_GET_CONST(int, conv->Op()->GetAttr("out_dtype")); + // if (out_dtype == proto::VarType::Type::VarType_Type_INT8) { + // fusion_nodes_map["out"]->Var()->SetDataType( + // proto::VarType::Type::VarType_Type_INT8); + // if (fusion_nodes_map["branch"]) { + // fusion_nodes_map["branch"]->Var()->SetDataType( + // proto::VarType::Type::VarType_Type_INT8); + // } + // } // Generate conv2d_xpu op framework::OpDesc conv2d_xpu_op_desc(block); + for (auto [first, second] : fusion_nodes_map) { + VLOG(1) << "first: " << first << " second: " << second; + if (first == "x" || first == "out" || first == "out_max" || + first == "branch") + continue; + if (second != nullptr) { + auto* temp_tensor = + scope->FindVar(second->Name())->GetMutable(); + VLOG(1) << *temp_tensor; + } + } // set input&output var conv2d_xpu_op_desc.SetType("conv2d_xpu"); - conv2d_xpu_op_desc.SetInput("x", {input->Name()}); - conv2d_xpu_op_desc.SetInput("filter", {filter_int16->Name()}); - conv2d_xpu_op_desc.SetInput("filter_max", {filter_max->Name()}); - conv2d_xpu_op_desc.SetOutput("out", {conv2d_xpu_out_name}); - conv2d_xpu_op_desc.SetOutput("out_max", {conv2d_xpu_out_max_name}); - // set fusion_bias input node - if (has_bias) { - conv2d_xpu_op_desc.SetInput("bias", 
{fusion_bias_node->Name()}); + conv2d_xpu_op_desc.SetInput("x", {fusion_nodes_map["x"]->Name()}); + if (fusion_nodes_map["x_max"]) { + conv2d_xpu_op_desc.SetInput("x_max", {fusion_nodes_map["x_max"]->Name()}); + } + conv2d_xpu_op_desc.SetInput("filter", {fusion_nodes_map["filter"]->Name()}); + conv2d_xpu_op_desc.SetInput("filter_max", + {fusion_nodes_map["filter_max"]->Name()}); + if (fusion_nodes_map["scale_max"]) { + conv2d_xpu_op_desc.SetInput("scale_max", + {fusion_nodes_map["scale_max"]->Name()}); + } + if (fusion_nodes_map["out_max_in"]) { + conv2d_xpu_op_desc.SetInput("out_max_in", + {fusion_nodes_map["out_max_in"]->Name()}); + } + conv2d_xpu_op_desc.SetOutput("out", {fusion_nodes_map["out"]->Name()}); + conv2d_xpu_op_desc.SetOutput("out_max", + {fusion_nodes_map["out_max"]->Name()}); + if (with_conv_bias || with_bn) { + PADDLE_ENFORCE_EQ( + fusion_nodes_map["bias"] != nullptr, + true, + platform::errors::InvalidArgument( + "fusion_nodes_map['bias'] node ptr can not be null")); + conv2d_xpu_op_desc.SetInput("bias", {fusion_nodes_map["bias"]->Name()}); } // set ew_branch_add input node if (ew_branch_add != nullptr) { - conv2d_xpu_op_desc.SetInput("branch", {ew_branch_add_in->Name()}); + PADDLE_ENFORCE_EQ( + fusion_nodes_map["branch"] != nullptr, + true, + platform::errors::InvalidArgument( + "fusion_nodes_map['branch'] node ptr can not be null")); + conv2d_xpu_op_desc.SetInput("branch", + {fusion_nodes_map["branch"]->Name()}); + if (fusion_nodes_map["branch_max"]) { + conv2d_xpu_op_desc.SetInput("branch_max", + {fusion_nodes_map["branch_max"]->Name()}); + } } + VLOG(1) << "creat conv2d_xpu_op_desc success!"; // set attrs of conv2d_xpu float act_param_ = 0.0f; if (!act_type.empty()) { @@ -646,57 +1196,73 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, "strides", PADDLE_GET_CONST(std::vector, conv->Op()->GetAttr("strides"))); conv2d_xpu_op_desc.SetAttr("paddings", conv_paddings); - conv2d_xpu_op_desc.SetAttr("out_dtype", out_dtype); + // out_dtype is same to input precision + conv2d_xpu_op_desc.SetAttr("out_dtype", + fusion_nodes_map["x"]->Var()->GetDataType()); + conv2d_xpu_op_desc.SetAttr( + "enable_int8", conv->Op()->GetAttrIfExists("enable_int8")); + if (enable_int8) { + conv2d_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["out"]->Name(), + conv->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["out"]->Name())); + conv2d_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["x"]->Name(), + conv->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["x"]->Name())); + if (fusion_nodes_map["branch"]) { + conv2d_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["branch"]->Name(), + conv->Op()->GetAttrIfExists( + "Input_scale_" + fusion_nodes_map["branch"]->Name())); + } + } + VLOG(1) << "Set attr success!"; + // Link node auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); - IR_NODE_LINK_TO(input, conv2d_xpu); - IR_NODE_LINK_TO(filter_int16, conv2d_xpu); - IR_NODE_LINK_TO(filter_max, conv2d_xpu); - if (ew_bias_add || bn) { - SAFE_IR_NODE_LINK_TO(fusion_bias_node, conv2d_xpu); - } - if (ew_branch_add_in) { - IR_NODE_LINK_TO(ew_branch_add_in, conv2d_xpu); - } - if (act_out) { - IR_NODE_LINK_TO(conv2d_xpu, act_out); - } else if (ew_branch_add_out) { - IR_NODE_LINK_TO(conv2d_xpu, ew_branch_add_out); - } else if (scale_out) { - IR_NODE_LINK_TO(conv2d_xpu, scale_out); - } else if (bn_out) { - IR_NODE_LINK_TO(conv2d_xpu, bn_out); - } else if (ew_bias_add_out) { - IR_NODE_LINK_TO(conv2d_xpu, ew_bias_add_out); - } else { - 
IR_NODE_LINK_TO(conv2d_xpu, conv_out); + IR_NODE_LINK_TO(fusion_nodes_map["x"], conv2d_xpu); + if (fusion_nodes_map["x_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["x_max"], conv2d_xpu); } - IR_NODE_LINK_TO(conv2d_xpu, conv2d_xpu_out_max); - // delete useless node - std::unordered_set delete_nodes = {conv}; - if (act != nullptr) { - delete_nodes.insert(act); + IR_NODE_LINK_TO(fusion_nodes_map["filter"], conv2d_xpu); + IR_NODE_LINK_TO(fusion_nodes_map["filter_max"], conv2d_xpu); + if (fusion_nodes_map["scale_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["scale_max"], conv2d_xpu); } - if (ew_branch_add != nullptr) { - delete_nodes.insert(ew_branch_add); + if (fusion_nodes_map["bias"]) { + SAFE_IR_NODE_LINK_TO(fusion_nodes_map["bias"], conv2d_xpu); + } + if (fusion_nodes_map["branch"]) { + IR_NODE_LINK_TO(fusion_nodes_map["branch"], conv2d_xpu); + } + if (fusion_nodes_map["branch_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["branch_max"], conv2d_xpu); + } + if (fusion_nodes_map["out_max_in"]) { + IR_NODE_LINK_TO(fusion_nodes_map["out_max_in"], conv2d_xpu); + } + IR_NODE_LINK_TO(conv2d_xpu, fusion_nodes_map["out"]); + IR_NODE_LINK_TO(conv2d_xpu, fusion_nodes_map["out_max"]); + // delete useless node + std::unordered_set delete_nodes; + if (conv != nullptr) { + delete_nodes.insert(conv); } if (scale != nullptr) { delete_nodes.insert(scale); } if (bn != nullptr) { delete_nodes.insert(bn); - delete_nodes.insert(bn_bias); - delete_nodes.insert(bn_var); - delete_nodes.insert(bn_mean); - delete_nodes.insert(bn_scale); - delete_nodes.insert(bn_var_out); - delete_nodes.insert(bn_mean_out); - delete_nodes.insert(bn_saved_var); - delete_nodes.insert(bn_saved_mean); } if (ew_bias_add != nullptr) { delete_nodes.insert(ew_bias_add); - delete_nodes.insert(ew_bias_add_y); + } + if (ew_branch_add != nullptr) { + delete_nodes.insert(ew_branch_add); + } + if (act != nullptr) { + delete_nodes.insert(act); } GraphSafeRemoveNodes(graph, delete_nodes); found_subgraph_count++; diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 4c8424b7df08f..5868db5627021 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -244,9 +244,62 @@ class FcXPUFusePass : public FusePassBase { bool with_bn, const std::string& act_type) const; + void CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + std::string mul_type, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_bias, + bool with_bn, + bool enable_int8) const; + + void CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + void CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const; + + Node* GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const; + const std::string name_scope_{"fc_xpu_fuse_pass"}; }; +Node* FcXPUFusePass::GetNodeFromNodesMap( + const std::map>& nodes_map, + std::string pattern_node_name, + std::string node_name) const { + auto iter = nodes_map.find(pattern_node_name); + PADDLE_ENFORCE_EQ( + iter != nodes_map.end(), + true, + platform::errors::InvalidArgument("nodes_map[%s] not found in nodes_map", + pattern_node_name.c_str())); + auto node_map = iter->second; + auto node_iter = 
node_map.find(node_name); + PADDLE_ENFORCE_EQ(node_iter != node_map.end(), + true, + platform::errors::InvalidArgument( + "nodes_map[%s][%s] not found in nodes_map", + pattern_node_name.c_str(), + node_name.c_str())); + return node_iter->second; +} + void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::PreconditionNotMet("graph should not be null.")); @@ -275,6 +328,395 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_subgraph_count); } +void FcXPUFusePass::CreateFusionWeightsAndBias( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + std::string mul_type, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool with_bias, + bool with_bn, + bool enable_int8) const { + // Get Node + auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); + PADDLE_ENFORCE_EQ( + mul != nullptr, + true, + platform::errors::InvalidArgument("mul node ptr can not be null")); + auto* mul_w = GetNodeFromNodesMap(nodes_map, "mul", "mul_w"); + PADDLE_ENFORCE_EQ( + mul_w != nullptr, + true, + platform::errors::InvalidArgument("mul_w node ptr can not be null")); + + // transfilter fp16 --> fp32 + auto* filter_t = + scope->FindVar(mul_w->Name())->GetMutable(); + auto filter_len = filter_t->numel(); + auto filter_dtype = filter_t->dtype(); + if (filter_dtype == phi::DataType::FLOAT16) { + CastToFp32(filter_t, nullptr); + } + + bool transpose_w = false; + if (mul_type == "matmul") { + transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y")); + } else if (mul_type == "matmul_v2") { + transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y")); + } + // Get Weight scale in int8 scene + std::vector weight_scale = + mul->Op()->GetAttrIfExists>("Input_scale_" + + mul_w->Name()); + // Create fusion_bias_node + auto filter_dims = filter_t->dims(); + bool has_bias = with_bn || with_bias; + Node* fusion_bias_node = nullptr; + if (with_bias) { + auto* ew_bias_add_bias = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_bias"); + PADDLE_ENFORCE_EQ(ew_bias_add_bias != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_bias node ptr can not be null")); + auto* ew_bias_add_bias_t = scope->FindVar(ew_bias_add_bias->Name()) + ->GetMutable(); + PrepareBias(graph, scope, block, ew_bias_add_bias, &fusion_bias_node); + } + + if (with_bn) { + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + PADDLE_ENFORCE_EQ( + bn != nullptr, + true, + platform::errors::InvalidArgument("bn node ptr can not be null")); + auto* bn_bias = GetNodeFromNodesMap(nodes_map, "bn", "bn_bias"); + PADDLE_ENFORCE_EQ( + bn_bias != nullptr, + true, + platform::errors::InvalidArgument("bn_bias node ptr can not be null")); + auto* bn_scale = GetNodeFromNodesMap(nodes_map, "bn", "bn_scale"); + PADDLE_ENFORCE_EQ( + bn_scale != nullptr, + true, + platform::errors::InvalidArgument("bn_scale node ptr can not be null")); + auto* bn_var = GetNodeFromNodesMap(nodes_map, "bn", "bn_var"); + PADDLE_ENFORCE_EQ( + bn_var != nullptr, + true, + platform::errors::InvalidArgument("bn_var node ptr can not be null")); + auto* bn_mean = GetNodeFromNodesMap(nodes_map, "bn", "bn_mean"); + PADDLE_ENFORCE_EQ( + bn_mean != nullptr, + true, + platform::errors::InvalidArgument("bn_mean node ptr can not be null")); + + auto bn_bias_t = + scope->Var(bn_bias->Name())->GetMutable(); + PADDLE_ENFORCE_EQ( + filter_dims[0], + bn_bias_t->dims()[0], + platform::errors::InvalidArgument("the shape[%d] of bn bias tensor " + "must equal out_channel[%d] of 
conv", + bn_bias_t->dims()[0], + filter_dims[0])); + auto bn_scale_t = + scope->Var(bn_scale->Name())->GetMutable(); + auto bn_mean_t = + scope->Var(bn_mean->Name())->GetMutable(); + auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); + float* bn_scale_ptr = + bn_scale_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_bias_ptr = + bn_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_mean_ptr = + bn_mean_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_var_ptr = + bn_var_t->mutable_data(paddle::platform::CPUPlace()); + auto mean_len = bn_mean_t->numel(); + auto filter_stride = filter_len / mean_len; + float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); + if (!with_bias) { // prev node is conv + PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); + } + + auto fusion_bias_t = + scope->Var(fusion_bias_node->Name())->GetMutable(); + float* fusion_bias_ptr = + fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + // recompute bias and weights + for (int i = 0; i < mean_len; ++i) { + bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); + } + // recompute the weights + if (!enable_int8) { + float* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < mean_len; ++i) { + for (int j = 0; j < filter_stride; j++) { + filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; + } + } + } else { + int8_t* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + PADDLE_ENFORCE_EQ( + weight_scale.size(), + mean_len, + platform::errors::InvalidArgument( + "Weight max_scale size must equal batch_norm sacle/mean size.")); + for (int i = 0; i < mean_len; i++) { + weight_scale[i] *= fabs(bn_scale_ptr[i]); + } + for (int i = 0; i < mean_len; i++) { + if (bn_scale_ptr[i] < 0) { + for (int j = 0; j < filter_stride; ++j) { + filter_ptr[i * filter_stride + j] *= -1; + } + } + } + } + // recompute bias + if (!with_bias) { + for (int i = 0; i < mean_len; ++i) { + fusion_bias_ptr[i] += (0.0f - bn_mean_ptr[i]) * bn_scale_ptr[i]; + } + } else { + for (int i = 0; i < mean_len; ++i) { + fusion_bias_ptr[i] = + bn_bias_ptr[i] + + (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i]; + } + } + } + + (*fusion_nodes_map)["bias"] = fusion_bias_node; + + Node* filter_intx = nullptr; + Node* filter_max = nullptr; + Node* scale_max = nullptr; + if (!enable_int8) { + PrepareWeight(graph, + scope, + block, + mul_w, + &filter_intx, + &filter_max, + !transpose_w, + weight_scale); + } else { + PrepareWeight(graph, + scope, + block, + mul_w, + &filter_intx, + &filter_max, + !transpose_w, + weight_scale); + } + + bool is_per_channel_need_create_scale_max_node = + !weight_scale.empty() && !IsPerTensorQuant(weight_scale); + if (is_per_channel_need_create_scale_max_node) { + phi::DenseTensor ones_weight_max_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + int max_ptr_size = weight_scale.empty() + ? 
phi::backends::xpu::get_xpu_max_ptr_size(-1) + : weight_scale.size(); + ones_weight_max_tensor.set_type(phi::DataType::FLOAT32); + ones_weight_max_tensor.Resize({max_ptr_size}); + std::vector ones_weight(max_ptr_size, 1.0); + memcpy(cpu_ctx->Alloc(&ones_weight_max_tensor), + ones_weight.data(), + max_ptr_size * sizeof(float)); + + std::string scale_max_name = mul_w->Name() + "_scale_max"; + VarDesc scale_max_desc(scale_max_name); + scale_max_desc.SetPersistable(true); + scale_max_desc.SetShape(vectorize(ones_weight_max_tensor.dims())); + scale_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + scale_max = graph->CreateVarNode(&scale_max_desc); + auto* block_scale_max_desc = block->Var(scale_max_name); + block_scale_max_desc->SetPersistable(scale_max_desc.Persistable()); + block_scale_max_desc->SetShape(scale_max_desc.GetShape()); + block_scale_max_desc->SetDataType(scale_max_desc.GetDataType()); + Assign(ones_weight_max_tensor, + scope->Var(scale_max_name)->GetMutable()); + } + + (*fusion_nodes_map)["w"] = filter_intx; + if (is_per_channel_need_create_scale_max_node) { + (*fusion_nodes_map)["w_max"] = scale_max; + (*fusion_nodes_map)["scale_max"] = filter_max; + } else { + (*fusion_nodes_map)["w_max"] = filter_max; + (*fusion_nodes_map)["scale_max"] = scale_max; + } +} + +void FcXPUFusePass::CreateFusionOutputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); + PADDLE_ENFORCE_EQ( + mul != nullptr, + true, + platform::errors::InvalidArgument("mul node ptr can not be null")); + // output && output max + std::string fc_xpu_out_name; + Node* fc_out_op_node = nullptr; + Node* fc_out_var_node = nullptr; + + auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + auto* ew_bias_add = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add"); + auto* act = GetNodeFromNodesMap(nodes_map, "act", "act"); + if (act) { + auto* act_out = GetNodeFromNodesMap(nodes_map, "act", "act_out"); + PADDLE_ENFORCE_EQ( + act_out != nullptr, + true, + platform::errors::InvalidArgument("act_out node ptr can not be null")); + fc_xpu_out_name = act_out->Name(); + fc_out_var_node = act_out; + fc_out_op_node = act; + } else if (bn) { + auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); + PADDLE_ENFORCE_EQ( + bn_out != nullptr, + true, + platform::errors::InvalidArgument("bn_out node ptr can not be null")); + fc_xpu_out_name = bn_out->Name(); + fc_out_var_node = bn_out; + fc_out_op_node = bn; + } else if (ew_bias_add) { + auto* ew_bias_add_out = + GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); + PADDLE_ENFORCE_EQ(ew_bias_add_out != nullptr, + true, + platform::errors::InvalidArgument( + "ew_bias_add_out node ptr can not be null")); + fc_xpu_out_name = ew_bias_add_out->Name(); + fc_out_var_node = ew_bias_add_out; + fc_out_op_node = ew_bias_add; + } else { + auto* mul_out = GetNodeFromNodesMap(nodes_map, "mul", "mul_out"); + PADDLE_ENFORCE_EQ( + mul_out != nullptr, + true, + platform::errors::InvalidArgument("mul_out node ptr can not be null")); + fc_xpu_out_name = mul_out->Name(); + fc_out_var_node = mul_out; + fc_out_op_node = mul; + } + (*fusion_nodes_map)["out"] = fc_out_var_node; + + // Create out max in + if (enable_int8) { + std::string fc_out_max_in_name = fc_xpu_out_name + "_max_in"; + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc fc_out_max_in_desc(fc_out_max_in_name); + 
fc_out_max_in_desc.SetPersistable(true); + fc_out_max_in_desc.SetShape({static_cast(max_ptr_size)}); + fc_out_max_in_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* fc_xpu_out_max_in = graph->CreateVarNode(&fc_out_max_in_desc); + auto* block_out_max_in_desc = block->Var(fc_out_max_in_name); + block_out_max_in_desc->SetPersistable(fc_out_max_in_desc.Persistable()); + block_out_max_in_desc->SetShape(fc_out_max_in_desc.GetShape()); + block_out_max_in_desc->SetDataType(fc_out_max_in_desc.GetDataType()); + + auto GetOutputScale = [&](Node* var_node, std::string name) -> float { + int nums_any_ops = var_node->outputs.size(); + for (size_t i = 0; i < nums_any_ops; ++i) { + auto* any_op_desc = fc_out_var_node->outputs[i]->Op(); + VLOG(1) << "any_op_desc: " << any_op_desc->Type(); + if (any_op_desc->HasAttr("Input_scale_" + name)) { + VLOG(1) << "find it: " + << "Input_scale_" + name; + return any_op_desc->GetAttrIfExists("Input_scale_" + name); + } + } + return 0; + }; + float output_scale = GetOutputScale(fc_out_var_node, fc_xpu_out_name); + mul->Op()->SetAttr("Input_scale_" + fc_xpu_out_name, output_scale); + VLOG(1) << "fc_xpu_out_name:" << fc_xpu_out_name + << " output_scale: " << output_scale + << "fc_out_var_node name:" << fc_out_var_node->Name(); + phi::DenseTensor out_max_in_cpu_tensor; + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + out_max_in_cpu_tensor.set_type(phi::DataType::FLOAT32); + out_max_in_cpu_tensor.Resize({max_ptr_size}); + std::vector output_scales(max_ptr_size, output_scale); + memcpy(cpu_ctx->Alloc(&out_max_in_cpu_tensor), + output_scales.data(), + max_ptr_size * sizeof(float)); + Assign(out_max_in_cpu_tensor, + scope->Var(fc_out_max_in_name)->GetMutable()); + (*fusion_nodes_map)["out_max_in"] = fc_xpu_out_max_in; + } + + // Create out max + std::string fc_out_max_name = fc_xpu_out_name + "_max"; + VarDesc fc_out_max_desc(fc_out_max_name); + Node* fc_xpu_out_max = graph->CreateVarNode(&fc_out_max_desc); + (*fusion_nodes_map)["out_max"] = fc_xpu_out_max; +} + +void FcXPUFusePass::CreateFusionInputs( + ir::Graph* graph, + Scope* scope, + BlockDesc* block, + const std::map>& nodes_map, + std::map* fusion_nodes_map, + bool enable_int8) const { + // Get Node + auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); + PADDLE_ENFORCE_EQ( + mul != nullptr, + true, + platform::errors::InvalidArgument("mul node ptr can not be null")); + auto* mul_x = GetNodeFromNodesMap(nodes_map, "mul", "mul_x"); + PADDLE_ENFORCE_EQ( + mul_x != nullptr, + true, + platform::errors::InvalidArgument("mul_x node ptr can not be null")); + // x max + std::string mul_x_max_name = mul_x->Name() + "_max"; + Node* mul_x_max = nullptr; + if (enable_int8) { + float input_scale = + mul->Op()->GetAttrIfExists("Input_scale_" + mul_x->Name()); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + VarDesc x_max_desc(mul_x_max_name); + x_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + x_max_desc.SetShape({static_cast(max_ptr_size)}); + x_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + mul_x_max = graph->CreateVarNode(&x_max_desc); + auto input_max_tensor = + scope->Var(mul_x_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, input_scale); + 
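+    // Replicate the single per-tensor input scale max_ptr_size times so the
+    // persistable x_max tensor matches the XPU max-pointer buffer layout.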
memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + } + (*fusion_nodes_map)["x"] = mul_x; + (*fusion_nodes_map)["x_max"] = mul_x_max; +} + int FcXPUFusePass::ApplyImpl(ir::Graph* graph, const std::string& mul_type, bool with_bias, @@ -287,7 +729,7 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, with_bias, with_bn, act_type); - + auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -311,108 +753,78 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(bn_saved_mean); GET_IR_NODE(act); GET_IR_NODE(act_out); - auto* block = mul->Op()->Block(); - auto* scope = param_scope(); - - auto* filter_t = - scope->FindVar(mul_w->Name())->GetMutable(); - // weight fp16 --> fp32 - auto filter_dtype = filter_t->dtype(); - int out_dtype = proto::VarType::Type::VarType_Type_FP32; - if (filter_dtype == phi::DataType::FLOAT16) { - out_dtype = proto::VarType::Type::VarType_Type_FP16; - CastToFp32(filter_t, nullptr); - } - auto filter_dims = filter_t->dims(); + std::map> nodes_map; + nodes_map.insert( + {"mul", {{"mul_x", mul_x}, {"mul_w", mul_w}, {"mul_out", mul_out}}}); + nodes_map.insert({"ew_bias_add", + {{"ew_bias_add", add}, + {"ew_bias_add_bias", bias}, + {"ew_bias_add_out", add_out}}}); + nodes_map.insert({"bn", + {{"bn", bn}, + {"bn_bias", bn_bias}, + {"bn_mean", bn_mean}, + {"bn_scale", bn_scale}, + {"bn_var", bn_var}, + {"bn_out", bn_out}, + {"bn_var_out", bn_var_out}, + {"bn_mean_out", bn_mean_out}, + {"bn_saved_var", bn_saved_var}, + {"bn_saved_mean", bn_saved_mean}}}); + nodes_map.insert({"act", {{"act", act}, {"act_out", act_out}}}); - bool transpose_w = false; - if (mul_type == "matmul") { - transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("transpose_Y")); - } else if (mul_type == "matmul_v2") { - transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y")); - } + std::map fusion_nodes_map{{"x", nullptr}, + {"x_max", nullptr}, + {"w", nullptr}, + {"w_max", nullptr}, + {"bias", nullptr}, + {"scale_max", nullptr}, + {"out_max_in", nullptr}, + {"out", nullptr}, + {"out_max", nullptr}}; - bool has_bias = with_bn || with_bias; - Node* fusion_bias_node = nullptr; - if (has_bias) { - if (bias != nullptr) { - PrepareBias(graph, scope, block, bias, &fusion_bias_node); - } - if (bn != nullptr) { - auto bn_bias_t = - scope->Var(bn_bias->Name())->GetMutable(); - auto bn_scale_t = - scope->Var(bn_scale->Name())->GetMutable(); - auto bn_mean_t = - scope->Var(bn_mean->Name())->GetMutable(); - auto bn_var_t = - scope->Var(bn_var->Name())->GetMutable(); - float* mul_w_ptr = filter_t->data(); - float* bn_scale_ptr = bn_scale_t->data(); - float* bn_bias_ptr = bn_bias_t->data(); - float* bn_mean_ptr = bn_mean_t->data(); - float* bn_var_ptr = bn_var_t->data(); - auto mean_len = bn_mean_t->numel(); - auto filter_h = filter_dims[0]; - auto filter_w = filter_dims[1]; - float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); - if (fusion_bias_node == nullptr) { // prev node is conv - PrepareBias(graph, scope, block, bn_bias, &fusion_bias_node); - } - auto fusion_bias_t = scope->Var(fusion_bias_node->Name()) - ->GetMutable(); - float* fusion_bias_ptr = fusion_bias_t->data(); - // recompute bias and weights - if (bias == nullptr) { - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - fusion_bias_ptr[i] += (0.f - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_h; j++) { - 
mul_w_ptr[j * filter_w + i] *= bn_scale_ptr[i]; - } - } - } else { - for (int i = 0; i < mean_len; ++i) { - bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); - bn_bias_ptr[i] += - (fusion_bias_ptr[i] - bn_mean_ptr[i]) * bn_scale_ptr[i]; - for (int j = 0; j < filter_h; j++) { - mul_w_ptr[j * filter_w + i] *= bn_scale_ptr[i]; - } - } - memcpy(fusion_bias_ptr, bn_bias_ptr, mean_len * sizeof(float)); - } - } - } - - Node* mul_w_int16 = nullptr; - Node* mul_w_max = nullptr; - PrepareWeight( - graph, scope, block, mul_w, &mul_w_int16, &mul_w_max, !transpose_w); - - std::string fc_out_name; - if (act_out) { - fc_out_name = act_out->Name(); - } else if (bn) { - fc_out_name = bn_out->Name(); - } else if (add_out) { - fc_out_name = add_out->Name(); - } else { - fc_out_name = mul_out->Name(); - } - std::string fc_out_max_name = fc_out_name + "_max"; - VarDesc fc_out_max_desc(fc_out_max_name); - Node* fc_out_max = graph->CreateVarNode(&fc_out_max_desc); + bool enable_int8 = mul->Op()->GetAttrIfExists("enable_int8"); + std::string op_precision_str = enable_int8 ? "int8" : "fp32"; + VLOG(4) << "FC fusion fuse pass is running on " << op_precision_str + << " precision!"; + auto* block = mul->Op()->Block(); + CreateFusionWeightsAndBias(graph, + scope, + block, + mul_type, + nodes_map, + &fusion_nodes_map, + with_bias, + with_bn, + enable_int8); + CreateFusionInputs( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + CreateFusionOutputs( + graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); + VLOG(1) << "CreateFusionOutputs success!"; // Generate fc_xpu op framework::OpDesc fc_xpu_op_desc(block); fc_xpu_op_desc.SetType("fc_xpu"); - fc_xpu_op_desc.SetInput("x", {mul_x->Name()}); - fc_xpu_op_desc.SetInput("w", {mul_w_int16->Name()}); - fc_xpu_op_desc.SetInput("w_max", {mul_w_max->Name()}); - if (has_bias) { - fc_xpu_op_desc.SetInput("bias", {fusion_bias_node->Name()}); + fc_xpu_op_desc.SetInput("x", {fusion_nodes_map["x"]->Name()}); + if (fusion_nodes_map["x_max"]) { + fc_xpu_op_desc.SetInput("x_max", {fusion_nodes_map["x_max"]->Name()}); + } + fc_xpu_op_desc.SetInput("w", {fusion_nodes_map["w"]->Name()}); + fc_xpu_op_desc.SetInput("w_max", {fusion_nodes_map["w_max"]->Name()}); + if (fusion_nodes_map["bias"]) { + fc_xpu_op_desc.SetInput("bias", {fusion_nodes_map["bias"]->Name()}); + } + if (fusion_nodes_map["scale_max"]) { + fc_xpu_op_desc.SetInput("scale_max", + {fusion_nodes_map["scale_max"]->Name()}); + } + if (fusion_nodes_map["out_max_in"]) { + fc_xpu_op_desc.SetInput("out_max_in", + {fusion_nodes_map["out_max_in"]->Name()}); } + fc_xpu_op_desc.SetOutput("out", {fusion_nodes_map["out"]->Name()}); + fc_xpu_op_desc.SetOutput("out_max", {fusion_nodes_map["out_max"]->Name()}); fc_xpu_op_desc.SetAttr( "in_num_col_dims", static_cast(mul_x->Var()->GetShape().size() - 1)); @@ -440,48 +852,54 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope"))); } } - fc_xpu_op_desc.SetAttr("out_dtype", out_dtype); - fc_xpu_op_desc.SetOutput("out", {fc_out_name}); - fc_xpu_op_desc.SetOutput("out_max", {fc_out_max_name}); + // out_dtype is same to input precision + fc_xpu_op_desc.SetAttr("out_dtype", + fusion_nodes_map["x"]->Var()->GetDataType()); + fc_xpu_op_desc.SetAttr("enable_int8", + mul->Op()->GetAttrIfExists("enable_int8")); + if (enable_int8) { + fc_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["out"]->Name(), + mul->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["out"]->Name())); + 
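+      // Carry the activation scale of the fused op's input over to fc_xpu as
+      // well, mirroring the output-scale attribute set just above.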
fc_xpu_op_desc.SetAttr( + "Input_scale_" + fusion_nodes_map["x"]->Name(), + mul->Op()->GetAttrIfExists("Input_scale_" + + fusion_nodes_map["x"]->Name())); + } + auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc); - IR_NODE_LINK_TO(mul_x, fc_xpu); - IR_NODE_LINK_TO(mul_w_int16, fc_xpu); - IR_NODE_LINK_TO(mul_w_max, fc_xpu); - if (bias || bn) { - SAFE_IR_NODE_LINK_TO(fusion_bias_node, fc_xpu); + IR_NODE_LINK_TO(fusion_nodes_map["x"], fc_xpu); + if (fusion_nodes_map["x_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["x_max"], fc_xpu); } - if (act_out) { - IR_NODE_LINK_TO(fc_xpu, act_out); - } else if (bn_out) { - IR_NODE_LINK_TO(fc_xpu, bn_out); - } else if (add_out) { - IR_NODE_LINK_TO(fc_xpu, add_out); - } else { - IR_NODE_LINK_TO(fc_xpu, mul_out); + IR_NODE_LINK_TO(fusion_nodes_map["w"], fc_xpu); + IR_NODE_LINK_TO(fusion_nodes_map["w_max"], fc_xpu); + if (fusion_nodes_map["scale_max"]) { + IR_NODE_LINK_TO(fusion_nodes_map["scale_max"], fc_xpu); } - IR_NODE_LINK_TO(fc_xpu, fc_out_max); + if (fusion_nodes_map["bias"]) { + IR_NODE_LINK_TO(fusion_nodes_map["bias"], fc_xpu); + } + if (fusion_nodes_map["out_max_in"]) { + IR_NODE_LINK_TO(fusion_nodes_map["out_max_in"], fc_xpu); + } + IR_NODE_LINK_TO(fc_xpu, fusion_nodes_map["out"]); + IR_NODE_LINK_TO(fc_xpu, fusion_nodes_map["out_max"]); // delete useless node std::unordered_set delete_nodes; - if (act != nullptr && add != nullptr) { - delete_nodes = {mul, mul_out, add, add_out, act}; - } else if (act) { - delete_nodes = {mul, mul_out, act}; - } else if (add) { - delete_nodes = {mul, mul_out, add}; - } else { - delete_nodes = {mul}; + if (mul != nullptr) { + delete_nodes.insert(mul); } if (bn != nullptr) { delete_nodes.insert(bn); - delete_nodes.insert(bn_bias); - delete_nodes.insert(bn_var); - delete_nodes.insert(bn_mean); - delete_nodes.insert(bn_scale); - delete_nodes.insert(bn_var_out); - delete_nodes.insert(bn_mean_out); - delete_nodes.insert(bn_saved_var); - delete_nodes.insert(bn_saved_mean); + } + if (add != nullptr) { + delete_nodes.insert(add); + } + if (act != nullptr) { + delete_nodes.insert(act); } GraphSafeRemoveNodes(graph, delete_nodes); found_subgraph_count++; diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 1a9db472bc2cc..3a6d29f794d65 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -77,8 +77,14 @@ LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern, const std::string& name_scope, bool with_branch) : PatternBase(pattern, name_scope, name_scope), with_branch_(with_branch) { - auto* fusion_op = - pattern->NewNode(fusion_op_repr())->assert_is_op("conv2d_xpu"); + auto* fusion_op = pattern->NewNode(fusion_op_repr()) + ->assert_is_op("conv2d_xpu") + ->assert_more([&](Node* node) { + bool enable_int8 = + node->Op()->GetAttrIfExists("enable_int8"); + return !enable_int8; + }); + auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x"); PDNode* branch = nullptr; if (with_branch_) { @@ -177,7 +183,12 @@ void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const { auto preop_max_var_name = x_pre_op->Output("out_max"); for (auto max_node : x->inputs[0]->outputs) { if (preop_max_var_name[0] == max_node->Name()) { - fusion_op_desc->SetInput("x_max", {max_node->Name()}); + if (fusion_op_desc->HasInput("x_max")) { + auto x_max_old_name = fusion_op_desc->Input("x_max")[0]; + fusion_op_desc->RenameInput(x_max_old_name, max_node->Name()); + } else { 
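+            // No x_max input exists on this op yet, so wire the producer's
+            // out_max variable in as its x_max.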
+ fusion_op_desc->SetInput("x_max", {max_node->Name()}); + } IR_NODE_LINK_TO(max_node, fusion_op); } } diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.cc b/paddle/fluid/framework/ir/xpu/pass_utils.cc index eeb0e23e19ecd..b895033108e12 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.cc +++ b/paddle/fluid/framework/ir/xpu/pass_utils.cc @@ -121,12 +121,115 @@ size_t HashTensor(const phi::DenseTensor& in) { template size_t HashTensor(const phi::DenseTensor& in); template size_t HashTensor(const phi::DenseTensor& in); +template size_t HashTensor(const phi::DenseTensor& in); std::string GetPrefixWithoutHash(const std::string& name) { std::size_t found = name.find("_#"); return found == std::string::npos ? name : name.substr(0, found); } +template +void PrepareWeight(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales) { + auto weight_name = weight->Name(); + auto* weight_tensor = scope->Var(weight_name)->GetMutable(); + phi::DenseTensor quant_weight_tensor; + Assign(*weight_tensor, &quant_weight_tensor); + phi::DenseTensor quant_weight_max_tensor; + ConvertWeightWrapper( + &quant_weight_tensor, &quant_weight_max_tensor, transpose, weight_scales); + size_t quant_weight_hash = HashTensor(quant_weight_tensor); + size_t quant_weight_max_hash = HashTensor(quant_weight_max_tensor); + std::string pre_name = GetPrefixWithoutHash(weight_name); + std::string quant_weight_name = + pre_name + "_#" + std::to_string(quant_weight_hash); + std::string quant_weight_max_name = + pre_name + "_max_#" + std::to_string(quant_weight_max_hash); + *quant_weight = FindNodeWithName(graph, quant_weight_name); + if (*quant_weight == nullptr) { + // Create quant_weight node + // Update quant_weight var_desc in block + VarDesc quant_weight_desc(quant_weight_name); + quant_weight_desc.SetPersistable(true); + quant_weight_desc.SetShape(vectorize(quant_weight_tensor.dims())); + quant_weight_desc.SetDataType( + framework::TransToProtoVarType(quant_weight_tensor.dtype())); + *quant_weight = graph->CreateVarNode(&quant_weight_desc); + auto* block_quant_weight_desc = block->Var(quant_weight_name); + block_quant_weight_desc->SetPersistable(quant_weight_desc.Persistable()); + block_quant_weight_desc->SetShape(quant_weight_desc.GetShape()); + block_quant_weight_desc->SetDataType(quant_weight_desc.GetDataType()); + // Create quant_weight_max node + // Update quant_weight_max var_desc in block + VarDesc quant_weight_max_desc(quant_weight_max_name); + quant_weight_max_desc.SetPersistable(true); + quant_weight_max_desc.SetShape(vectorize(quant_weight_max_tensor.dims())); + quant_weight_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + *quant_weight_max = graph->CreateVarNode(&quant_weight_max_desc); + auto* block_quant_weight_max_desc = block->Var(quant_weight_max_name); + block_quant_weight_max_desc->SetPersistable( + quant_weight_max_desc.Persistable()); + block_quant_weight_max_desc->SetShape(quant_weight_max_desc.GetShape()); + block_quant_weight_max_desc->SetDataType( + quant_weight_max_desc.GetDataType()); + // Find dst/dst_max variable in scope + auto* quant_weight_var = scope->FindVar(quant_weight_name); + if (quant_weight_var == nullptr) { + // Create quant_weight/quant_weight_max variable/tensor + Assign(quant_weight_tensor, + scope->Var(quant_weight_name)->GetMutable()); + Assign(quant_weight_max_tensor, + scope->Var(quant_weight_max_name)->GetMutable()); + } else { + // Share the same 
variable + PADDLE_ENFORCE_NOT_NULL( + scope->FindVar(quant_weight_max_name), + platform::errors::Fatal("quant_weight_max(%s) variable should not be " + "nullptr if quant_weight(%s) " + "variable is exist. (weight_name is %s)", + quant_weight_max_name, + quant_weight_name, + weight_name)); + } + } else { + *quant_weight_max = FindNodeWithName(graph, quant_weight_max_name); + PADDLE_ENFORCE_NOT_NULL( + *quant_weight_max, + platform::errors::Fatal("quant_weight_max(%s) variable should not be " + "nullptr if quant_weight(%s) " + "variable is exist. (weight_name is %s)", + quant_weight_max_name, + quant_weight_name, + weight_name)); + } +} + +template void PrepareWeight( + Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales); + +template void PrepareWeight( + Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales); + template void PrepareWeight(Graph* graph, Scope* scope, @@ -137,6 +240,7 @@ void PrepareWeight(Graph* graph, bool transpose) { auto src_name = src->Name(); auto* src_tensor = scope->Var(src_name)->GetMutable(); + phi::DenseTensor dst_tensor; Assign(*src_tensor, &dst_tensor); phi::DenseTensor dst_max_tensor; diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.h b/paddle/fluid/framework/ir/xpu/pass_utils.h index d1e7b218a0b46..556a14fa0e9e4 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.h +++ b/paddle/fluid/framework/ir/xpu/pass_utils.h @@ -57,6 +57,28 @@ std::vector FindOpNodeByInputName(Graph* graph, template size_t HashTensor(const phi::DenseTensor& in); +template ::value, Tcpu>::type* + ptr = nullptr> +void ConvertWeightWrapper(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + ConvertWithQuant(weight, weight_max, transpose, weight_scales); +} + +template ::value, Tcpu>::type* + ptr = nullptr> +void ConvertWeightWrapper(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + ConvertWithoutQuant(weight, weight_max, transpose, weight_scales); +} + template void PrepareWeight(Graph* graph, Scope* scope, @@ -66,6 +88,16 @@ void PrepareWeight(Graph* graph, Node** dst_max, bool transpose); +template +void PrepareWeight(Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** quant_weight, + Node** quant_weight_max, + bool transpose, + const std::vector& weight_scales); + void PrepareBias( Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst); diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index fcda50051a362..ada4a4b9b6c2f 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -145,6 +145,41 @@ void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out) { } } +void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out) { + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + + paddle::experimental::CheckAndTrans2Contiguous(in); + + phi::DenseTensor int8_tensor; + phi::DenseTensor* out_ptr = out == nullptr ? 
&int8_tensor : out; + out_ptr->Resize(in->dims()); + out_ptr->set_type(phi::DataType::INT8); + out_ptr->set_layout(in->layout()); + + switch (in->dtype()) { + case phi::DataType::FLOAT32: + phi::CastKernel(*cpu_ctx, *in, phi::DataType::INT8, out_ptr); + break; + case phi::DataType::INT8: + if (out == nullptr) { + return; + } else { + phi::AssignKernel(*cpu_ctx, *in, out_ptr); + } + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support fp32, but received dtype is %s.", + phi::DataTypeToString(in->dtype()))); + break; + } + + if (out == nullptr) { + Assign(*out_ptr, in); + } +} + static float FindMaxAbs(const float* data, int len) { float max_f = 0.0f; for (int i = 0; i < len; ++i) { @@ -258,6 +293,100 @@ void QuantFP32ToIntX(const float* src_ptr, } } +template < + typename Tcpu, + typename Txpu, + typename std::enable_if::value, Tcpu>::type* ptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + LOG(FATAL) << "Not support for Tcpu is " + << phi::CppTypeToDataType::Type(); +} + +template < + typename Tcpu, + typename Txpu, + typename std::enable_if::value, Tcpu>::type* ptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + if (!weight_scales.empty()) { + LOG(FATAL) << "Weight scales should be empty(), otherwise, check if your " + "model is quant model or not."; + } + + // Convert fp16 to fp32 + phi::DenseTensor weight_fp32; + CastToFp32(weight, &weight_fp32); + + if (transpose) { + Transpose2D(&weight_fp32); + } + + // Find max + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + int size = weight_fp32.numel(); + auto* weight_data = weight_fp32.data(); + float max_val = FindMaxAbs(weight_data, size); + std::vector max_vec(max_ptr_size, max_val); + weight_max->set_type(phi::DataType::FLOAT32); + weight_max->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + memcpy(cpu_ctx->Alloc(weight_max), + max_vec.data(), + max_ptr_size * sizeof(float)); + + // Quant + weight->set_type(phi::CppTypeToDataType::Type()); + weight->Resize(weight_fp32.dims()); + QuantFP32ToIntX( + weight_data, cpu_ctx->Alloc(weight), max_val, size); +} + +template +void ConvertWithoutQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales) { + if (transpose) { + Transpose2D(weight); + } + if (std::is_same::value || std::is_same::value) { + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + int max_ptr_size = weight_scales.empty() + ? 
phi::backends::xpu::get_xpu_max_ptr_size(-1) + : weight_scales.size(); + weight_max->set_type(phi::DataType::FLOAT32); + weight_max->Resize({max_ptr_size}); + if (!weight_scales.empty()) { + memcpy(cpu_ctx->Alloc(weight_max), + weight_scales.data(), + max_ptr_size * sizeof(float)); + } else { + LOG(FATAL) << "weight scales cannot be empty!"; + } + } else { + LOG(FATAL) << "Only support int8<->int8 and int16<->int16 convert."; + } +} + +template void ConvertWithQuant( + phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + +template void ConvertWithoutQuant( + phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + template void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, @@ -298,6 +427,23 @@ template void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose); +bool IsPerTensorQuant(const std::vector& weight_max) { + bool per_tensor = true; + PADDLE_ENFORCE_GT( + weight_max.size(), + 0, + platform::errors::InvalidArgument( + "Op's channel size: [%d] should great than zero", weight_max.size())); + auto first = weight_max[0]; + for (size_t i = 1; i < weight_max.size(); ++i) { + if (std::abs(first - weight_max[i]) > 1e-6) { + per_tensor = false; + break; + } + } + return per_tensor; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h index b417fa03323db..30f73023b632d 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.h +++ b/paddle/fluid/framework/ir/xpu/quant_utils.h @@ -25,8 +25,34 @@ void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); +void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); + void CastToInt32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); +template +void ConvertWithoutQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + +template ::value, Tcpu>::type* + ptr = nullptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + +template ::value, + Tcpu>::type* ptr = nullptr> +void ConvertWithQuant(phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); + // 1. Quant weight from fp32 to int16/int31 // 2. Weight data is in-place update. // 3. 
Generate weight max tensor @@ -35,6 +61,8 @@ void PrepareWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose); +bool IsPerTensorQuant(const std::vector& weight_max); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 0af6876faca05..2561e14d06d1e 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -13,8 +13,13 @@ cc_library( cc_library( convert_to_mixed_precision SRCS convert_to_mixed_precision.cc - DEPS analysis_pass ir_graph_build_pass auto_mixed_precision_pass - constant_folding_pass identity_op_clean_pass) + DEPS analysis_pass + ir_graph_build_pass + auto_mixed_precision_pass + constant_folding_pass + identity_op_clean_pass + delete_quant_dequant_linear_op_pass + delete_weight_dequant_linear_op_pass) cc_library( ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index d706113307009..ef352712102c4 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -17,6 +17,8 @@ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/constant_folding_pass.h" +#include "paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.h" +#include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/identity_op_clean_pass.h" #include "paddle/fluid/inference/io.h" @@ -89,12 +91,22 @@ void ConvertToMixedPrecisionPass::LoadModel() { void ConvertToMixedPrecisionPass::Run() { LoadModel(); + if (backend_ == phi::Backend::XPU) { + framework::ir::DeleteQuantDequantLinearOpPass + delete_quant_dequant_linear_op_pass; + delete_quant_dequant_linear_op_pass.Apply(main_graph_.get()); + framework::ir::DeleteWeightDequantLinearOpPass + delete_weight_dequant_linear_op_pass; + delete_weight_dequant_linear_op_pass.Apply(main_graph_.get()); + } + framework::ir::ConstantFoldingPass constant_folding_pass; constant_folding_pass.Apply(main_graph_.get()); framework::ir::AutoMixedPrecisionPass auto_mixed_precision_pass; auto_mixed_precision_pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); + if (backend_ == phi::Backend::GPU) { auto_mixed_precision_pass.Set("enable_gpu_mixed", new bool{true}); } else if (backend_ == phi::Backend::XPU) { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index c7f3f87a4d192..65fd8a74aa101 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -507,6 +507,8 @@ void CpuPassStrategy::EraseFcMkldnnPasses() { XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { passes_.assign({ + "delete_quant_dequant_linear_op_pass", + "delete_weight_dequant_linear_op_pass", "delete_assign_op_pass", "delete_dropout_op_pass", "delete_concat_op_pass", @@ -562,6 +564,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "delete_isolated_node_pass", // "auto_mixed_precision_pass", "cast_mixed_precision_op_fuse_pass", + 
"auto_trans_quantize_op_precision_pass", "inplace_op_var_pass", }); use_xpu_ = true; diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index 226c87b35d458..7d8f28c61d49f 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -62,14 +62,14 @@ optional : bias, x_max - op : conv2d_xpu - args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, int act_type, float act_param, DataType out_dtype) + args : (Tensor x, Tensor x_max, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, Tensor branch_max, Tensor scale_max, Tensor out_max_in, int[] paddings, int[] dilations, int[] strides, str padding_algorithm, int groups, int act_type, float act_param, DataType out_dtype) output : Tensor(out), Tensor(out_max) infer_meta : func : Conv2dXPUInferMeta kernel : func : conv2d_xpu data_type : x - optional : bias, branch, branch_max ,x_max + optional : bias, branch, branch_max ,x_max, scale_max, out_max_in - op : embedding_with_eltwise_add_xpu args : (Tensor[] ids, Tensor[] tables, Tensor mask, int64_t padding_idx) @@ -101,14 +101,14 @@ data_type : x - op : fc_xpu - args : (Tensor x, Tensor x_max, Tensor w, Tensor w_max, Tensor bias, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha, DataType out_dtype) + args : (Tensor x, Tensor x_max, Tensor w, Tensor w_max, Tensor bias, Tensor scale_max, Tensor out_max_in, int in_num_col_dims, bool transpose_x, float alpha, float beta, int act_type, float act_alpha, DataType out_dtype) output : Tensor(out), Tensor(out_max) infer_meta : func : FcXPUInferMeta kernel : func : fc_xpu data_type : x - optional : bias, x_max + optional : bias, x_max, scale_max, out_max_in - op : fused_bias_act args : (Tensor x, Tensor bias, Tensor dequant_scales, Tensor shift, Tensor smooth, str act_method = "gelu", str compute_dtype = "default", float quant_scale = -1, int quant_round_type = 1, float quant_max_bound = 127.0, float quant_min_bound = -127.0) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 39defa8bdddd7..f8139af52cb22 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -176,7 +176,9 @@ XPUOpMap& get_kl2_ops() { {"conv1d_xpu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"conv2d_xpu", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT8})}, {"conv3d_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"conv3d", @@ -317,7 +319,9 @@ XPUOpMap& get_kl2_ops() { {"fast_layernorm_xpu", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"fc_xpu", - XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + XPUKernelSet({phi::DataType::FLOAT32, + phi::DataType::FLOAT16, + phi::DataType::INT8})}, {"fill", XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, diff --git a/paddle/phi/infermeta/fusion.cc b/paddle/phi/infermeta/fusion.cc index 8dfdf7f89fde7..679eb70ccfd01 100644 --- a/paddle/phi/infermeta/fusion.cc +++ b/paddle/phi/infermeta/fusion.cc @@ -227,6 +227,8 @@ void Conv2dXPUInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& branch, const MetaTensor& branch_max, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, const 
std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -377,6 +379,8 @@ void FcXPUInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& w_max, const MetaTensor& bias, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, diff --git a/paddle/phi/infermeta/fusion.h b/paddle/phi/infermeta/fusion.h index ecda5cb9c8818..08469f4cec577 100644 --- a/paddle/phi/infermeta/fusion.h +++ b/paddle/phi/infermeta/fusion.h @@ -62,6 +62,8 @@ void Conv2dXPUInferMeta(const MetaTensor& x, const MetaTensor& bias, const MetaTensor& branch, const MetaTensor& branch_max, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -86,6 +88,8 @@ void FcXPUInferMeta(const MetaTensor& x, const MetaTensor& w, const MetaTensor& w_max, const MetaTensor& bias, + const MetaTensor& scale_max, + const MetaTensor& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc index 43caa13698b48..9dce663de72c7 100644 --- a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc @@ -12,9 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "glog/logging.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/common/memory_utils.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/cpu/conv_util.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" namespace phi { namespace fusion { @@ -32,6 +35,8 @@ void Conv2dXPUKernelImpl(const Context& ctx, const paddle::optional& bias, const paddle::optional& branch, const paddle::optional& branch_max, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -66,14 +71,22 @@ void Conv2dXPUKernelImpl(const Context& ctx, int out_c = static_cast(filter_dims[0]); int win_h = static_cast(filter_dims[2]); int win_w = static_cast(filter_dims[3]); - + VLOG(1) << "KERNEL1"; auto* input_data = reinterpret_cast(x.data()); + VLOG(1) << "KERNEL1.5"; const float* input_max_data = x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data(); + VLOG(1) << "KERNEL2"; auto* filter_data = reinterpret_cast(filter.data()); auto* filter_max_data = filter_max.data(); + auto* scale_max_data = scale_max.get_ptr() == nullptr + ? nullptr + : scale_max.get_ptr()->data(); const XPUTypeOut* branch_data = nullptr; + const float* branch_max_data = branch_max.get_ptr() == nullptr + ? 
nullptr + : branch_max.get_ptr()->data(); auto* branch_tensor = branch.get_ptr(); xpu::ctx_guard RAII_GUARD(ctx.x_context()); if (branch_tensor != nullptr) { @@ -81,32 +94,269 @@ void Conv2dXPUKernelImpl(const Context& ctx, branch_data = reinterpret_cast(branch_tensor->data()); } else { - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::cast( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); - branch_data = branch_data_temp; + if (branch_tensor->dtype() == phi::DataType::FLOAT32 && + out->dtype() == phi::DataType::INT8) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT32 && " + "out->dtype() == phi::DataType::INT8"; + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::quantization( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + branch_data = reinterpret_cast(branch_data_temp); + } else if (branch_tensor->dtype() == phi::DataType::FLOAT16 && + out->dtype() == phi::DataType::INT8) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT16 && " + "out->dtype() == phi::DataType::INT8"; + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::quantization( + ctx.x_context(), + reinterpret_cast( + branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + branch_data = reinterpret_cast(branch_data_temp); + } else if (branch_tensor->dtype() == phi::DataType::INT8 && + out->dtype() == phi::DataType::FLOAT32) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " + "out->dtype() == phi::DataType::FLOAT32"; + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch.get_ptr()->dtype(), + // branch.get_ptr()->numel() * sizeof(int8_t)); + // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_before[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::dequantization( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // phi::DataType::FLOAT32, + // branch.get_ptr()->numel() * sizeof(float)); + // memory_utils::Copy(CPUPlace(), + // static_cast(temp_tensor_cpu.data()), + // ctx.GetPlace(), + // static_cast(branch_data_temp), + // branch.get_ptr()->numel() * sizeof(float)); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_after[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + branch_data = reinterpret_cast(branch_data_temp); + } else if (branch_tensor->dtype() == phi::DataType::INT8 && + out->dtype() == phi::DataType::FLOAT16) { + VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " + "out->dtype() == phi::DataType::FLOAT16"; + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch.get_ptr()->dtype(), + // 
branch.get_ptr()->numel() * sizeof(int8_t)); + // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_before[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::dequantization( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel(), + branch_max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); + // if (branch_tensor) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // phi::DataType::FLOAT16, + // branch.get_ptr()->numel() * + // sizeof(dtype::float16)); + // memory_utils::Copy(CPUPlace(), + // static_cast(temp_tensor_cpu.data()), + // ctx.GetPlace(), + // static_cast(branch_data_temp), + // branch.get_ptr()->numel() * sizeof(dtype::float16)); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data_quantize_after[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + branch_data = reinterpret_cast(branch_data_temp); + } else { + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::cast( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + branch_data = branch_data_temp; + } } } - const float* branch_max_data = branch_max.get_ptr() == nullptr - ? nullptr - : branch_max.get_ptr()->data(); + VLOG(1) << "KERNEL3"; const float* bias_data = bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); auto* out_data = reinterpret_cast(ctx.template Alloc(out)); auto* out_max_data = ctx.template Alloc(out_max); + out_max_data = out_max_in.get_ptr() != nullptr + ? 
const_cast(out_max_in.get_ptr()->data()) + : out_max_data; + VLOG(1) << "KERNEL4.5"; xpu::Activation_t act(static_cast(act_type)); + VLOG(1) << "KERNEL5"; if (act_type == xpu::Activation_t::LEAKY_RELU) { act.leaky_alpha = act_param; } else if (act_type == xpu::Activation_t::HARD_SIGMOID) { act.hard_sigmoid_slope = act_param; } + // if (input_max_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // x_max.get_ptr()->dtype(), + // x_max.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *x_max.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { + // VLOG(1) << "input_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + + // if (filter_max_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // filter_max.dtype(), + // filter_max.numel() * sizeof(float)); + // phi::Copy(ctx, filter_max, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { + // VLOG(1) << "filter_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + + // if (input_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, x.dtype(), x.numel() * sizeof(T_X)); + // phi::Copy(ctx, x, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "input_data[" << i + // << "]:" << static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (filter_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); + // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "filter_data[" << i + // << "]:" << static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (bias_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // bias.get_ptr()->dtype(), + // bias.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *bias.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "bias_data[" << i << "]:" << + // temp_tensor_cpu.data()[i]; + // } + // } + + // if (branch_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch.get_ptr()->dtype(), + // branch.get_ptr()->numel() * sizeof(T_OUT)); + // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_data[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (branch_max) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // branch_max.get_ptr()->dtype(), + // branch_max.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *branch_max.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "branch_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + + // if (scale_max) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc(&temp_tensor_cpu, + // scale_max.get_ptr()->dtype(), + // scale_max.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *scale_max.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "scale_max_data[" << i + // << "]:" << temp_tensor_cpu.data()[i]; + // } + // } + // if (filter_data) { + // DenseTensor temp_tensor_cpu; + // 
ctx.template HostAlloc( + // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); + // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "filter_data[" << i + // << "]:" << static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (out_max_in.get_ptr()) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, out_max_in.get_ptr()->dtype(), + // out_max_in.get_ptr()->numel() * sizeof(float)); + // phi::Copy(ctx, *out_max_in.get_ptr(), CPUPlace(), false, + // &temp_tensor_cpu); for (size_t i = 0; i < out_max_in.get_ptr()->numel(); + // ++i) { + // VLOG(1) << "output_max_data_before[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } int r = xpu:: conv2d_fusion( // TX/TW/TY/TGEMM /* baidu::xpu::api::Context* ctx */ ctx.x_context(), @@ -131,8 +381,32 @@ void Conv2dXPUKernelImpl(const Context& ctx, /* const TY* branch */ branch_data, /* const baidu::xpu::api::Activation_t& act */ act, /* const float* branch_maxptr */ branch_max_data, - /* const float* scale */ nullptr); + /* const float* scale */ scale_max_data); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_xpu"); + // if (out_data) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, out->dtype(), out->numel() * sizeof(T_OUT)); + // phi::Copy(ctx, *out, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "output_data[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } + + // if (out_max) { + // DenseTensor temp_tensor_cpu; + // ctx.template HostAlloc( + // &temp_tensor_cpu, out_max->dtype(), out_max->numel() * + // sizeof(float)); + // phi::Copy(ctx, *out_max, CPUPlace(), false, &temp_tensor_cpu); + // for (size_t i = 0; i < 50; ++i) { + // VLOG(1) << "output_max_data_after[" << i + // << "]:" << + // static_cast(temp_tensor_cpu.data()[i]); + // } + // } } #define CONV2D_XPU_KERNEL_IMPL(x_dtype_, w_dtype_, out_dtype_, gemm_dtype_) \ @@ -145,6 +419,8 @@ void Conv2dXPUKernelImpl(const Context& ctx, bias, \ branch, \ branch_max, \ + scale_max, \ + out_max_in, \ paddings, \ dilations, \ strides, \ @@ -164,6 +440,8 @@ void Conv2dXPUKernel(const Context& ctx, const paddle::optional& bias, const paddle::optional& branch, const paddle::optional& branch_max, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, const std::vector& paddings, const std::vector& dilations, const std::vector& strides, @@ -174,14 +452,118 @@ void Conv2dXPUKernel(const Context& ctx, DataType out_dtype, DenseTensor* out, DenseTensor* out_max) { - if (out_dtype == DataType::FLOAT32) { - CONV2D_XPU_KERNEL_IMPL(T, int16_t, float, int16_t); - } else if (out_dtype == DataType::FLOAT16) { - CONV2D_XPU_KERNEL_IMPL(T, int16_t, dtype::float16, int16_t); - } else { - PADDLE_THROW(phi::errors::Unimplemented("Not support out_dtype is %s.", - DataTypeToString(out_dtype))); + // Dont use template T param + VLOG(1) << "Kernel type: " << x.dtype() << "," << filter.dtype() << " ," + << out_dtype; + if (x.dtype() == DataType::FLOAT32) { + // float32/float16 kernel + if (filter.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(float, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL(float, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " 
+ "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else if (filter.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(float, int8_t, float, int8_t); + } else if (out_dtype == DataType::INT8) { + CONV2D_XPU_KERNEL_IMPL(float, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + return; } + + if (x.dtype() == DataType::FLOAT16) { + // float16 kernel + if (filter.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(phi::dtype::float16, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL( + phi::dtype::float16, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else if (filter.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL( + phi::dtype::float16, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + CONV2D_XPU_KERNEL_IMPL(phi::dtype::float16, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + if (x.dtype() == DataType::INT8) { + if (filter.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + CONV2D_XPU_KERNEL_IMPL(int8_t, int8_t, float, int8_t); + } else if (out_dtype == DataType::FLOAT16) { + CONV2D_XPU_KERNEL_IMPL(int8_t, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + CONV2D_XPU_KERNEL_IMPL(int8_t, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, filter_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(filter.dtype()), + DataTypeToString(out_dtype))); } } // namespace fusion @@ -192,4 +574,5 @@ PD_REGISTER_KERNEL(conv2d_xpu, ALL_LAYOUT, phi::fusion::Conv2dXPUKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + int8_t) {} diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc 
b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index 6a6721194e9a8..f2acd0893a6f7 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "glog/logging.h" #include "paddle/phi/backends/xpu/enforce_xpu.h" #include "paddle/phi/core/kernel_registry.h" @@ -29,6 +30,8 @@ void FcXPUKernelImpl(const Context& ctx, const DenseTensor& w, const DenseTensor& w_max, const paddle::optional& bias, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, @@ -53,7 +56,13 @@ void FcXPUKernelImpl(const Context& ctx, bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); auto* out_data = reinterpret_cast(ctx.template Alloc(out)); + auto* scale_max_data = scale_max.get_ptr() == nullptr + ? nullptr + : scale_max.get_ptr()->data(); auto* out_max_data = ctx.template Alloc(out_max); + out_max_data = out_max_in.get_ptr() != nullptr + ? const_cast(out_max_in.get_ptr()->data()) + : out_max_data; xpu::Activation_t act(static_cast(act_type)); if (act_type == xpu::Activation_t::LEAKY_RELU) { act.leaky_alpha = act_alpha; @@ -80,7 +89,9 @@ void FcXPUKernelImpl(const Context& ctx, alpha, // alpha beta, // beta bias_data, // bias - act); + act, // act + scale_max_data); // scale + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_xpu"); } @@ -92,6 +103,8 @@ void FcXPUKernelImpl(const Context& ctx, w, \ w_max, \ bias, \ + scale_max, \ + out_max_in, \ in_num_col_dims, \ transpose_x, \ alpha, \ @@ -108,6 +121,8 @@ void FcXPUKernel(const Context& ctx, const DenseTensor& w, const DenseTensor& w_max, const paddle::optional& bias, + const paddle::optional& scale_max, + const paddle::optional& out_max_in, int in_num_col_dims, bool transpose_x, float alpha, @@ -117,14 +132,117 @@ void FcXPUKernel(const Context& ctx, DataType out_dtype, DenseTensor* out, DenseTensor* out_max) { - if (out_dtype == DataType::FLOAT32) { - FC_XPU_KERNEL_IMPL(T, int16_t, float, int16_t); - } else if (out_dtype == DataType::FLOAT16) { - FC_XPU_KERNEL_IMPL(T, int16_t, dtype::float16, int16_t); - } else { - PADDLE_THROW(phi::errors::Unimplemented("Not support out_dtype is %s.", - DataTypeToString(out_dtype))); + // Dont use template T param + VLOG(1) << "Kernel type: " << x.dtype() << "," << w.dtype() << " ," + << out_dtype; + if (x.dtype() == DataType::FLOAT32) { + // float32/float16 kernel + if (w.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(float, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(float, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else if (w.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(float, int8_t, float, int8_t); + } else if (out_dtype == DataType::INT8) { + FC_XPU_KERNEL_IMPL(float, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype 
is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + if (x.dtype() == DataType::FLOAT16) { + // float16 kernel + if (w.dtype() == DataType::INT16) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(phi::dtype::float16, int16_t, float, int16_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL( + phi::dtype::float16, int16_t, dtype::float16, int16_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else if (w.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(phi::dtype::float16, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + FC_XPU_KERNEL_IMPL(phi::dtype::float16, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + return; } + + if (x.dtype() == DataType::INT8) { + if (w.dtype() == DataType::INT8) { + if (out_dtype == DataType::FLOAT32) { + FC_XPU_KERNEL_IMPL(int8_t, int8_t, float, int8_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(int8_t, int8_t, dtype::float16, int8_t); + } else if (out_dtype == DataType::INT8) { + FC_XPU_KERNEL_IMPL(int8_t, int8_t, int8_t, int8_t); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is " + "%s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); + } + return; + } + + PADDLE_THROW(phi::errors::Unimplemented( + "Not support x_dtype is %s, w_dtype is %s and out_dtype is %s.", + DataTypeToString(x.dtype()), + DataTypeToString(w.dtype()), + DataTypeToString(out_dtype))); } } // namespace fusion @@ -135,4 +253,5 @@ PD_REGISTER_KERNEL(fc_xpu, ALL_LAYOUT, phi::fusion::FcXPUKernel, float, - phi::dtype::float16) {} + phi::dtype::float16, + int8_t) {} From 9483e72100f906d901a970090464bfee81196ad8 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Wed, 20 Sep 2023 11:05:54 +0800 Subject: [PATCH 02/15] support fc_xpu int8 --- .../auto_trans_quantize_op_precision_pass.cc | 54 +++++++++---------- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 12 +++-- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 15 +++++- paddle/fluid/framework/ir/xpu/quant_utils.cc | 5 +- .../ir/xpu/reshape2_matmul_xpu_fuse_pass.cc | 27 ++++++++++ .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 4 +- 6 files changed, 80 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc index c8b4b7c040f7e..9fec1091bd9a9 100644 --- a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc +++ b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc @@ -44,7 
+44,7 @@ class AutoTransQuantizeOpPrecisionPass : public FusePassBase { const std::string name_scope_{"auto_trans_quantize_op_precision_pass"}; const std::unordered_set support_fusion_quant_op_type_{ - "conv2d_xpu"}; + "conv2d_xpu", "fc_xpu"}; }; static inline Node* GetOpOutVarNodeByArgsName(ir::Graph* graph, @@ -72,35 +72,33 @@ void AutoTransQuantizeOpPrecisionPass::FirstRound(ir::Graph* graph) const { bool enable_int8 = op_node->Op()->GetAttrIfExists("enable_int8"); int out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); if (enable_int8) { - if (op_type == "conv2d_xpu") { - auto* out_var_node = - GetOpOutVarNodeByArgsName(subgraph, op_node, "out"); - PADDLE_ENFORCE_NOT_NULL( - out_var_node, - platform::errors::InvalidArgument( - "out_var_node in graph cannot be nullptr.")); - bool is_int8_out = true; - for (auto* next_op_node : out_var_node->outputs) { - auto next_op_type = next_op_node->Op()->Type(); - bool is_next_op_support_int8 = - next_op_node->Op()->GetAttrIfExists("enable_int8") && - ((support_fusion_quant_op_type_.find(next_op_type) != - support_fusion_quant_op_type_.end())); - if (!is_next_op_support_int8) { - is_int8_out = false; - break; - } - } - if (is_int8_out) { - op_node->Op()->SetAttr( - "out_dtype", - static_cast(proto::VarType::Type::VarType_Type_INT8)); - out_var_node->Var()->SetDataType( - proto::VarType::Type::VarType_Type_INT8); - VLOG(1) << "The out var node " << out_var_node->Name() - << " is INT8"; + auto* out_var_node = + GetOpOutVarNodeByArgsName(subgraph, op_node, "out"); + PADDLE_ENFORCE_NOT_NULL( + out_var_node, + platform::errors::InvalidArgument( + "out_var_node in graph cannot be nullptr.")); + bool is_int8_out = true; + for (auto* next_op_node : out_var_node->outputs) { + auto next_op_type = next_op_node->Op()->Type(); + bool is_next_op_support_int8 = + next_op_node->Op()->GetAttrIfExists("enable_int8") && + ((support_fusion_quant_op_type_.find(next_op_type) != + support_fusion_quant_op_type_.end())); + if (!is_next_op_support_int8) { + is_int8_out = false; + break; } } + if (is_int8_out) { + op_node->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + out_var_node->Var()->SetDataType( + proto::VarType::Type::VarType_Type_INT8); + VLOG(1) << "The out var node " << out_var_node->Name() + << " is INT8"; + } } } } diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 5868db5627021..f087b7caf20ab 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -367,8 +367,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( } // Get Weight scale in int8 scene std::vector weight_scale = - mul->Op()->GetAttrIfExists>("Input_scale_" + - mul_w->Name()); + mul->Op()->GetAttrIfExists>("weight_scale"); // Create fusion_bias_node auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_bias; @@ -754,8 +753,11 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, GET_IR_NODE(act); GET_IR_NODE(act_out); std::map> nodes_map; - nodes_map.insert( - {"mul", {{"mul_x", mul_x}, {"mul_w", mul_w}, {"mul_out", mul_out}}}); + nodes_map.insert({"mul", + {{"mul", mul}, + {"mul_x", mul_x}, + {"mul_w", mul_w}, + {"mul_out", mul_out}}}); nodes_map.insert({"ew_bias_add", {{"ew_bias_add", add}, {"ew_bias_add_bias", bias}, @@ -785,7 +787,7 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, bool enable_int8 = mul->Op()->GetAttrIfExists("enable_int8"); std::string op_precision_str = enable_int8 ? 
"int8" : "fp32"; - VLOG(4) << "FC fusion fuse pass is running on " << op_precision_str + VLOG(1) << "FC fusion fuse pass is running on " << op_precision_str << " precision!"; auto* block = mul->Op()->Block(); CreateFusionWeightsAndBias(graph, diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 3a6d29f794d65..d9ab5448d0fda 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -106,7 +106,13 @@ struct LinkFcPattern : public PatternBase { LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, name_scope) { - auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op("fc_xpu"); + auto* fusion_op = pattern->NewNode(fusion_op_repr()) + ->assert_is_op("fc_xpu") + ->assert_more([&](Node* node) { + bool enable_int8 = + node->Op()->GetAttrIfExists("enable_int8"); + return !enable_int8; + }); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); fusion_op->LinksFrom({x}); @@ -231,7 +237,12 @@ void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const { auto preop_max_var_name = x_pre_op->Output("out_max"); for (auto max_node : x->inputs[0]->outputs) { if (preop_max_var_name[0] == max_node->Name()) { - fusion_op_desc->SetInput("x_max", {max_node->Name()}); + if (fusion_op_desc->HasInput("x_max")) { + auto x_max_old_name = fusion_op_desc->Input("x_max")[0]; + fusion_op_desc->RenameInput(x_max_old_name, max_node->Name()); + } else { + fusion_op_desc->SetInput("x_max", {max_node->Name()}); + } IR_NODE_LINK_TO(max_node, fusion_op); } } diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index ada4a4b9b6c2f..90ca41f72958e 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -64,9 +64,12 @@ void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out) { case phi::DataType::FLOAT32: phi::TransposeKernel(*cpu_ctx, *in, axis, out_ptr); break; + case phi::DataType::INT8: + phi::TransposeKernel(*cpu_ctx, *in, axis, out_ptr); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( - "Only support fp16 and fp32, but received dtype is %s.", + "Only support fp16/fp32/int8, but received dtype is %s.", phi::DataTypeToString(in->dtype()))); break; } diff --git a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc index 8383501c30b8f..fff3c4020b544 100644 --- a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc @@ -286,6 +286,33 @@ void MapMatmulV2ToMatmulXPUPass::MapMatmulV2ToMatmul(ir::Graph* graph) const { desc.SetAttr("transpose_X", matmul_v2->Op()->GetAttr("trans_x")); desc.SetAttr("transpose_Y", matmul_v2->Op()->GetAttr("trans_y")); desc.SetAttr("alpha", 1.0f); + if (matmul_v2->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2->Op()->GetAttr("enable_int8")); + } + if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_x->Name())) { + desc.SetAttr("Input_scale_" + matmul_x->Name(), + matmul_v2->Op()->GetAttr("Input_scale_" + matmul_x->Name())); + } + if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_y->Name())) { + desc.SetAttr("Input_scale_" + matmul_y->Name(), + matmul_v2->Op()->GetAttr("Input_scale_" + matmul_y->Name())); + } + if (matmul_v2->Op()->HasAttr("Input_scale_" + 
matmul_out->Name())) { + desc.SetAttr( + "Input_scale_" + matmul_out->Name(), + matmul_v2->Op()->GetAttr("Input_scale_" + matmul_out->Name())); + } + if (matmul_v2->Op()->HasAttr("weight_scale")) { + desc.SetAttr("weight_scale", matmul_v2->Op()->GetAttr("weight_scale")); + } + if (matmul_v2->Op()->HasAttr("weight_bit_length")) { + desc.SetAttr("weight_bit_length", + matmul_v2->Op()->GetAttr("weight_bit_length")); + } + if (matmul_v2->Op()->HasAttr("weight_quant_axis")) { + desc.SetAttr("weight_quant_axis", + matmul_v2->Op()->GetAttr("weight_quant_axis")); + } if (matmul_v2->Op()->HasAttr("use_mkldnn")) { desc.SetAttr("use_mkldnn", matmul_v2->Op()->GetAttr("use_mkldnn")); } diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index f2acd0893a6f7..eeb36a86eeec7 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -133,7 +133,7 @@ void FcXPUKernel(const Context& ctx, DenseTensor* out, DenseTensor* out_max) { // Dont use template T param - VLOG(1) << "Kernel type: " << x.dtype() << "," << w.dtype() << " ," + VLOG(1) << "Kernel type: " << x.dtype() << " ," << w.dtype() << " ," << out_dtype; if (x.dtype() == DataType::FLOAT32) { // float32/float16 kernel @@ -155,6 +155,8 @@ void FcXPUKernel(const Context& ctx, FC_XPU_KERNEL_IMPL(float, int8_t, float, int8_t); } else if (out_dtype == DataType::INT8) { FC_XPU_KERNEL_IMPL(float, int8_t, int8_t, int8_t); + } else if (out_dtype == DataType::FLOAT16) { + FC_XPU_KERNEL_IMPL(float, int8_t, dtype::float16, int8_t); } else { PADDLE_THROW(phi::errors::Unimplemented( "Not support x_dtype is %s, w_dtype is %s and out_dtype is " From 3ab34c63594ee6eeaa5c6db1e1d66bc122bdae14 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 9 Oct 2023 16:11:50 +0800 Subject: [PATCH 03/15] support quantize of pass --- paddle/fluid/framework/ir/CMakeLists.txt | 8 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 12 + .../ir/quantize_related_pass_utils.h | 84 +++++ paddle/fluid/framework/ir/xpu/pass_utils.h | 18 + .../ir/xpu/xpu_graph_pattern_detector.cc | 128 +++++++ .../ir/xpu/xpu_graph_pattern_detector.h | 96 ++++++ .../framework/ir/xpu/xpu_quantize_op_pass.cc | 275 +++++++++++++++ .../framework/ir/xpu/xpu_quantize_op_pass.h | 65 ++++ .../ir/xpu/xpu_quantize_squash_pass.cc | 312 ++++++++++++++++++ .../ir/xpu/xpu_quantize_squash_pass.h | 110 ++++++ .../inference/api/paddle_pass_builder.cc | 6 +- paddle/phi/api/yaml/ops.yaml | 20 ++ paddle/phi/backends/xpu/xpu2_op_list.cc | 4 + paddle/phi/infermeta/binary.cc | 20 ++ paddle/phi/infermeta/binary.h | 12 + .../phi/kernels/xpu/dequantization_kernel.cc | 66 ++++ paddle/phi/kernels/xpu/quantization_kernel.cc | 70 ++++ 17 files changed, 1303 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/framework/ir/quantize_related_pass_utils.h create mode 100644 paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc create mode 100644 paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc create mode 100644 paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h create mode 100644 paddle/phi/kernels/xpu/dequantization_kernel.cc create mode 100644 paddle/phi/kernels/xpu/quantization_kernel.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt 
index e9a8e4cc22cac..42e9a1267e0ee 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -237,7 +237,11 @@ if(WITH_XPU) xpu_pass_utils SRCS xpu/pass_utils.cc DEPS pass xpu_quant_utils) - set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils) + cc_library( + xpu_graph_pattern_detector + SRCS xpu/xpu_graph_pattern_detector.cc + DEPS graph_pattern_detector) + set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils xpu_graph_pattern_detector) pass_library(cast_mixed_precision_op_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) @@ -247,6 +251,8 @@ if(WITH_XPU) # pass_library(conv1d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(conv2d_bias_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(xpu_quantize_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) + pass_library(xpu_quantize_squash_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(auto_trans_quantize_op_precision_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 42c7f7acdc103..9245305889907 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -19,6 +19,7 @@ #include #include #include +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" namespace paddle { namespace framework { @@ -94,6 +95,8 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { scope, platform::errors::InvalidArgument( "Scope in DeleteQuantDequantLinearOpPass should not be null.")); + std::unordered_map> var_quant_scales{}; + // Create pattern patterns::DeleteQuantDequantLinearOpPattern pattern(gpd.mutable_pattern(), pattern_name); @@ -146,6 +149,11 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { any_op_desc->SetAttr( "Input_bit_length_" + quantize_linear_op_x->Var()->Name(), bit_length); + if (!var_quant_scales.count(quantize_linear_op_x->Var()->Name())) { + var_quant_scales.insert( + std::make_pair(quantize_linear_op_x->Var()->Name(), + std::vector({input_scale}))); + } // link x to any_op2 any_op_desc->RenameInput(dequantize_linear_op_out->Var()->Name(), quantize_linear_op_x->Var()->Name()); @@ -165,6 +173,10 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(found_count); + + // save var_quant_scales in the temporary save op's attr01 + SaveInfoInTheTmpOp( + graph, "has_quant_info", "var_quant_scales", var_quant_scales); } } // namespace ir diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h new file mode 100644 index 0000000000000..ce97cdd5fee33 --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -0,0 +1,84 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +static void SaveInfoInTheTmpOp( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map) { + VLOG(3) << "save variables in the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + OpDesc op_desc; + op_desc.SetType("save"); + auto* op_node = graph->CreateOpNode(&op_desc); + + op_node->Op()->SetAttr(flag, true); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + op_node->Op()->SetAttr(iter->first + suffix, iter->second); + } +} + +static void GetInfoFromTheTmpOp( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + std::unordered_map>* info_map) { + VLOG(3) << "get variables from the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() != "save") continue; + VLOG(5) << "Come in save op"; + auto* op_desc = op_node->Op(); + if (op_desc->GetAttrIfExists(flag)) { + VLOG(5) << "flag is true"; + op_desc->RemoveAttr(flag); + std::vector attr_names = op_desc->AttrNames(); + for (auto fake_name : attr_names) { + VLOG(5) << "fake_name:" << fake_name; + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + VLOG(5) << "name:" << name; + auto scales_vector = + PADDLE_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); + VLOG(5) << "scales_vector:" << scales_vector[0]; + info_map->insert(std::make_pair(name, scales_vector)); + VLOG(5) << "insert success:"; + op_desc->RemoveAttr(fake_name); + VLOG(5) << "remove success:"; + } + } + graph->RemoveNode(op_node); + VLOG(5) << "remove op node success:"; + break; + } + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.h b/paddle/fluid/framework/ir/xpu/pass_utils.h index 556a14fa0e9e4..417ba361e4348 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.h +++ b/paddle/fluid/framework/ir/xpu/pass_utils.h @@ -101,6 +101,24 @@ void PrepareWeight(Graph* graph, void PrepareBias( Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst); +inline std::string FindOutputNameByVarName(framework::OpDesc* op, + const std::string& searched_name) { + std::string ret; + for (const auto& name : op->OutputNames()) + for (const auto& output_name : op->Output(name)) + if (output_name == searched_name) ret = name; + return ret; +} + +inline std::string FindInputNameByVarName(framework::OpDesc* op, + const std::string& searched_name) { + std::string ret; + for (const auto& name : op->InputNames()) + for (const auto& input_name : op->Input(name)) + if (input_name == searched_name) ret = name; + return ret; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc 
b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc new file mode 100644 index 0000000000000..f74f9c8289d65 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc @@ -0,0 +1,128 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { +PDNode *patterns::DequantXPUAny::operator()() { + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize_xpu", "y"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksTo({dequant_out}); + next_op->LinksFrom({dequant_out}); + + return dequant_out; +} + +PDNode *patterns::QuantXPUAny::operator()() { + auto *quant_in = pattern->NewNode(quant_in_repr()) + ->AsInput() + ->assert_is_op_input("quantize_xpu", "x"); + auto *quant_op = + pattern->NewNode(quant_op_repr())->assert_is_op("quantize_xpu"); + + auto *quant_out = pattern->NewNode(quant_out_repr()) + ->AsOutput() + ->assert_is_op_output("quantize_xpu", "y"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + quant_op->LinksFrom({quant_in}).LinksTo({quant_out}); + next_op->LinksFrom({quant_out}); + + return quant_out; +} + +PDNode *patterns::DequantQuantXPUAny::operator()() { + auto *dequant_in = pattern->NewNode(dequant_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize_xpu", "x"); + auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize_xpu", "max"); + + auto *dequant_op = + pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); + + auto *dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize_xpu", "y"); + + auto *quant_max_in = pattern->NewNode(quant_max_in_repr()) + ->assert_is_op_input("quantize_xpu", "max"); + + auto *quant_op = pattern->NewNode(quant_op_repr()) + ->assert_is_op("quantize_xpu") + ->AsIntermediate(); + + auto *quant_out = pattern->NewNode(quant_out_repr()) + ->AsOutput() + ->assert_is_op_output("quantize_xpu"); + + auto *next_op = pattern->NewNode(next_op_repr())->assert_is_op(); + + dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); + quant_op->LinksFrom({dequant_out, quant_max_in}).LinksTo({quant_out}); + next_op->LinksFrom({quant_out}); + + return quant_out; +} + +PDNode *patterns::OpDequantXPU::operator()() { + auto any_op = pattern->NewNode(any_op_repr())->assert_is_op(); + auto *dequant_in = pattern->NewNode(dequant_in_repr()) + ->assert_is_op_input("dequantize_xpu", "x"); + + auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) + ->AsInput() + ->assert_is_op_input("dequantize_xpu", "max"); + auto *dequant_op = + 
pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); + auto dequant_out = pattern->NewNode(dequant_out_repr()) + ->AsOutput() + ->assert_is_op_output("dequantize_xpu", "y"); + + any_op->LinksTo({dequant_in}); + dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); + return dequant_out; +} + +PDNode *patterns::MultipleQuantizeXPU::operator()() { + auto *prev_out = pattern->NewNode(prev_out_repr())->AsOutput(); + + // find nodes that are inputs to quantize operators + prev_out->assert_more([&](Node *node) { + int counter = static_cast(std::count_if( + node->outputs.begin(), node->outputs.end(), [&](Node const *iter) { + return iter && iter->IsOp() && iter->Op()->Type() == "quantize_xpu"; + })); + return (counter > 1); + }); + + return prev_out; +} + +} // namespace patterns +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h new file mode 100644 index 0000000000000..c849b2a24bb48 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h @@ -0,0 +1,96 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +// Dequantize + anyOP +// This quantize is used for getting number of ops the Dequantize's +// output is an input to. +struct DequantXPUAny : public PatternBase { + DequantXPUAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_xpu_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Quantize + anyOP +struct QuantXPUAny : public PatternBase { + QuantXPUAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "quant_xpu_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(quant_in); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Dequantize + Quantize + anyOP +// This pattern is used for squashing the dequantize-quantize pairs. 
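+// A rough sketch of the matched subgraph (names follow the nodes declared
+// below; not an exhaustive description):
+//   dequant_in -> dequantize_xpu -> dequant_out -> quantize_xpu -> quant_out -> next_op
+// where both ops also take a "max" scale input. When the two scales match,
+// xpu_quantize_squash_pass can drop the pair and feed dequant_in to next_op
+// directly.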
+struct DequantQuantXPUAny : public PatternBase { + DequantQuantXPUAny(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "dequant_quant_xpu_any") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(dequant_in); + PATTERN_DECL_NODE(dequant_max_in); + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); + PATTERN_DECL_NODE(quant_max_in); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); + PATTERN_DECL_NODE(next_op); +}; + +// Op + Dequant +// named nodes: +// any_op, dequant_in +// dequant_op, dequant_out +struct OpDequantXPU : public PatternBase { + OpDequantXPU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "op_dequant_xpu") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(any_op); + PATTERN_DECL_NODE(dequant_in); + PATTERN_DECL_NODE(dequant_max_in); + PATTERN_DECL_NODE(dequant_op); + PATTERN_DECL_NODE(dequant_out); +}; + +// anyOp + more then one quantize op +// This pattern is used for squashing multiple quantize with the same scale. +struct MultipleQuantizeXPU : public PatternBase { + MultipleQuantizeXPU(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "multiple_quantize_xpu") {} + PDNode* operator()(); + + PATTERN_DECL_NODE(prev_out); +}; + +} // namespace patterns +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc new file mode 100644 index 0000000000000..a8fc7102a8d88 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -0,0 +1,275 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
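+//
+// Note (sketch of the pass's intent, inferred from the code below): for each
+// quantizable fused op (currently conv2d_xpu) whose input scale is recorded in
+// var_quant_scales_, insert
+//   x (fp32/fp16) -> quantize_xpu -> x_int8 -> conv2d_xpu
+// and, when the output scale is also known,
+//   conv2d_xpu -> out_int8 -> dequantize_xpu -> out (original dtype).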
+ +#include "paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h" + +#include +#include +#include + +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/utils/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +static void UnlinkNodes(ir::Node* a, ir::Node* b) { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); +} + +void XPUQuantizeOpPass::GetQuantInfo(Graph* graph) const { + GetInfoFromTheTmpOp( + graph, + "has_quant_info", + "var_quant_scales", + const_cast>*>( + &var_quant_scales_)); +} + +bool XPUQuantizeOpPass::AreScalesPresentForNodes( + std::initializer_list nodes) const { + bool present = true; + for (auto node : nodes) { + if (var_quant_scales_.count(node->Name()) == 0) { + present = false; + } + } + return present; +} + +float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { + return var_quant_scales_.at(node->Name())[0]; +} + +void XPUQuantizeOpPass::QuantizeInput(Graph* g, + Node* op, + Node* input, + std::string input_arg_name) const { + auto* scope = param_scope(); + auto inputs = op->Op()->InputNames(); + bool name_found = + std::find(inputs.begin(), inputs.end(), input_arg_name) != inputs.end(); + PADDLE_ENFORCE_EQ(name_found, + true, + platform::errors::InvalidArgument( + "Var(%s) isn't the input of the %s operator.", + input_arg_name, + op->Op()->Type())); + + // Create quantize output variable + VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); + auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); + quantize_out_node->Var()->SetDataType( + proto::VarType::Type::VarType_Type_INT8); + // Create quantize max_ptr node + + float scale = GetScaleValueForNode(input); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + std::string input_max_name = input->Name() + "_quantize_max"; + VarDesc input_max_desc(input_max_name); + input_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + input_max_desc.SetShape({static_cast(max_ptr_size)}); + input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* input_max_node = g->CreateVarNode(&input_max_desc); + auto input_max_tensor = + scope->Var(input_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, scale); + memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + + // create a quantize op node + OpDesc q_desc; + q_desc.SetType("quantize_xpu"); + q_desc.SetInput("x", std::vector({input->Name()})); + q_desc.SetInput("max", std::vector({input_max_name})); + q_desc.SetOutput("y", std::vector({quantize_out_node->Name()})); + q_desc.SetAttr("out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + q_desc.SetAttr("scale", static_cast(scale)); + + auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
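+  // Resulting subgraph (sketch): input -> quantize_op -> quantize_out_node -> op,
+  // with input_max_node feeding the "max" input of quantize_op.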
+ // update op's input + op->Op()->SetInput(input_arg_name, + std::vector({quantize_out_node->Name()})); + // link quantize op + UnlinkNodes(input, op); + IR_NODE_LINK_TO(input, quantize_op); + IR_NODE_LINK_TO(input_max_node, quantize_op); + IR_NODE_LINK_TO(quantize_op, quantize_out_node); + IR_NODE_LINK_TO(quantize_out_node, op); +} + +void XPUQuantizeOpPass::DequantizeOutput(Graph* g, + Node* op, + Node* output, + std::string output_arg_name) const { + auto* scope = param_scope(); + auto outputs = op->Op()->OutputNames(); + bool name_found = + std::find(outputs.begin(), outputs.end(), output_arg_name) != + outputs.end(); + PADDLE_ENFORCE_EQ(name_found, + true, + platform::errors::InvalidArgument( + "Var(%s) isn't the output of the %s operator.", + output_arg_name, + op->Op()->Type())); + + // Create dequantize input variable + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + dequantize_in_node->Var()->SetDataType( + proto::VarType::Type::VarType_Type_INT8); + + // Create dequantize max_ptr node + float scale = GetScaleValueForNode(output); + int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); + std::string input_max_name = output->Name() + "_dequantize_max"; + VarDesc input_max_desc(input_max_name); + input_max_desc.SetPersistable( + true); // Need depends on ir_params_sync_among_devices_pass copy to xpu + // device + input_max_desc.SetShape({static_cast(max_ptr_size)}); + input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + Node* input_max_node = g->CreateVarNode(&input_max_desc); + auto input_max_tensor = + scope->Var(input_max_name)->GetMutable(); + input_max_tensor->set_type(phi::DataType::FLOAT32); + input_max_tensor->Resize({max_ptr_size}); + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + std::vector input_scales(max_ptr_size, scale); + memcpy(cpu_ctx->Alloc(input_max_tensor), + input_scales.data(), + max_ptr_size * sizeof(float)); + + // create a quantize op node + OpDesc deq_desc; + deq_desc.SetType("dequantize_xpu"); + deq_desc.SetInput("x", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetInput("max", std::vector({input_max_name})); + deq_desc.SetOutput("y", std::vector({output->Name()})); + deq_desc.SetAttr("out_dtype", static_cast(output->Var()->GetDataType())); + deq_desc.SetAttr("scale", static_cast(scale)); + + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
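+  // Resulting subgraph (sketch): op -> dequantize_in_node -> dequantize_op -> output,
+  // i.e. `op` now writes the int8 tensor and dequantize_xpu restores the
+  // original output variable and dtype.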
+ // update op's input + op->Op()->SetOutput(output_arg_name, + std::vector({dequantize_in_node->Name()})); + // link dequantize op + UnlinkNodes(op, output); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(input_max_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, output); +} + +void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { + for (auto* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if (op->Type() != "conv2d_xpu") { + continue; + } + Node* w_var_node = nullptr; + Node* x_var_node = nullptr; + Node* out_var_node = nullptr; + Node* branch_var_node = nullptr; + + for (auto* input_node : n->inputs) { + if (!input_node->IsVar()) { + continue; + } + if (input_node->Var()->Name() == op->Input("x")[0]) { + x_var_node = input_node; + } else if (input_node->Var()->Name() == op->Input("filter")[0]) { + w_var_node = input_node; + } else if (op->HasInput("branch") && + input_node->Var()->Name() == op->Input("branch")[0]) { + branch_var_node = input_node; + } + } + + for (auto* output_node : n->outputs) { + if (!output_node->IsVar()) { + continue; + } + if (output_node->Var()->Name() == op->Output("out")[0]) { + out_var_node = output_node; + } + } + if (!AreScalesPresentForNodes({x_var_node})) { + // MarkAndLogCannotQuantizeOp(conv_op, + // "No scale available for the operator"); + return; + } + + QuantizeInput(graph, n, x_var_node, "x"); + // Branch input + if (branch_var_node != nullptr) { + if (AreScalesPresentForNodes({branch_var_node})) { + QuantizeInput(graph, n, branch_var_node, "branch"); + } else { + n->Op()->SetAttr("xpu_op_force_output_precision", + branch_var_node->Var()->GetDataType()); + } + } + + auto has_output_scale = AreScalesPresentForNodes({out_var_node}); + if (has_output_scale) { + DequantizeOutput(graph, n, out_var_node, "out"); + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + } else { + n->Op()->SetAttr("xpu_op_force_output_precision", + x_var_node->Var()->GetDataType()); + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + } + } + } +} + +void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) const { + VLOG(3) << "Insert quantize/dequantize op to the graph."; + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(name_scope_, graph); + PADDLE_ENFORCE_NOT_NULL( + param_scope(), + platform::errors::InvalidArgument("Scope cannot be nullptr.")); + + GetQuantInfo(graph); + VLOG(1) << "Get quant info from graph success."; + QuantizeConv(graph); + VLOG(1) << "Quantize conv of the graph success."; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(xpu_quantize_op_pass, paddle::framework::ir::XPUQuantizeOpPass); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h new file mode 100644 index 0000000000000..0b74682009351 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h @@ -0,0 +1,65 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * Quantize all supported operators.
+ */
+class XPUQuantizeOpPass : public FusePassBase {
+ public:
+  virtual ~XPUQuantizeOpPass() {}
+
+ protected:
+  void ApplyImpl(Graph* graph) const override;
+  void QuantizeConv(Graph* graph) const;
+
+ private:
+  void QuantizeInput(Graph* g,
+                     Node* op,
+                     Node* input,
+                     std::string input_arg_name) const;
+
+  void DequantizeOutput(Graph* g,
+                        Node* op,
+                        Node* output,
+                        std::string output_arg_name) const;
+
+  void GetQuantInfo(Graph* graph) const;
+
+  bool AreScalesPresentForNodes(std::initializer_list nodes) const;
+
+  float GetScaleValueForNode(Node* node) const;
+
+  std::unordered_map> var_quant_scales_;
+  const std::string name_scope_{"xpu_quantize_op_pass"};
+};
+
+} // namespace ir
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc
new file mode 100644
index 0000000000000..8571dee220d3b
--- /dev/null
+++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc
@@ -0,0 +1,312 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+// implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
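+//
+// Note (informal summary, based on the handlers below): this pass cleans up
+// the graph produced by xpu_quantize_op_pass, e.g. a
+//   conv2d_xpu -> dequantize_xpu -> quantize_xpu -> conv2d_xpu
+// chain collapses to
+//   conv2d_xpu -> conv2d_xpu
+// when both sides use the same scale, and duplicate quantize_xpu ops fed by
+// the same variable with the same scale are merged.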
+ +#include "paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h" + +#include +#include + +#include "paddle/fluid/framework/ir/xpu/pass_utils.h" +#include "paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/utils/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +XPUQuantizeSquashPass::XPUQuantizeSquashPass() {} + +void XPUQuantizeSquashPass::FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + patterns::DequantXPUAny deq_any_pattern{gpd.mutable_pattern(), + "dequant_xpu_any"}; + deq_any_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, deq_any_pattern); + + if (nodes_keep_counter->find(dequant_out) == nodes_keep_counter->end()) + (*nodes_keep_counter)[dequant_out] = 1; + else + (*nodes_keep_counter)[dequant_out] += 1; + + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +void XPUQuantizeSquashPass::DequantQuantSquash( + Graph* graph, + std::unordered_map* nodes_keep_counter) const { + GraphPatternDetector gpd; + LOG(INFO) << "DequantQuantSquash COME IN"; + patterns::DequantQuantXPUAny squash_pattern{gpd.mutable_pattern(), + "dequant_quant_xpu_any"}; + squash_pattern(); + + int found_dequant_quant_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + LOG(INFO) << "squash dequantize-quantize ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern); + + auto* next_op_desc = next_op->Op(); + float dequant_scale = + PADDLE_GET_CONST(float, dequant_op->Op()->GetAttr("scale")); + float quant_scale = + PADDLE_GET_CONST(float, quant_op->Op()->GetAttr("scale")); + + PADDLE_ENFORCE_NE( + nodes_keep_counter->find(dequant_out), + nodes_keep_counter->end(), + platform::errors::NotFound("The dequant output node is not found.")); + + // check if dequantize op should be kept or removed, decrease the counter + bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; + + int equal = dequant_scale == quant_scale ? 
1 : 0; + if (dequant_scale == quant_scale || isnan(dequant_scale) || + isnan(quant_scale) || isinf(dequant_scale) || isinf(quant_scale)) { + // squash dequantize-quantize to nothing + + auto quant_out_var_name = quant_out->Name(); + for (auto input_name : next_op_desc->InputNames()) { + auto& input_names = next_op_desc->MutableInputs()->at(input_name); + std::replace(input_names.begin(), + input_names.end(), + quant_out_var_name, + dequant_in->Name()); + next_op_desc->SetInput(input_name, input_names); + } + + if (keep_dequant) + GraphSafeRemoveNodes(graph, {quant_op, quant_out}); + else + GraphSafeRemoveNodes(graph, + {dequant_op, quant_op, dequant_out, quant_out}); + + IR_NODE_LINK_TO(dequant_in, next_op); + + found_dequant_quant_count++; + } else { + // squash dequantize-quantize to requantize op + // OpDesc desc; + // desc.SetType("requantize"); + // desc.SetInput("Input", + // std::vector({dequant_in->Name()})); + // desc.SetOutput("Output", + // std::vector({quant_out->Name()})); + // desc.SetAttr("Scale_in", dequant_scale); + // desc.SetAttr("Shift_in", dequant_shift); + // desc.SetAttr("Scale_out", quant_scale); + // desc.SetAttr("Shift_out", quant_shift); + + // auto requant_op = g->CreateOpNode(&desc); + + // if (keep_dequant) + // GraphSafeRemoveNodes(graph, {quant_op}); + // else + // GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out}); + + // IR_NODE_LINK_TO(dequant_in, requant_op); + // IR_NODE_LINK_TO(requant_op, quant_out); + + // found_dequant_quant_count++; + } + }; + gpd(graph, handler); + AddStatis(found_dequant_quant_count); + PrettyLogDetail("--- squashed %d dequantize-quantize pairs", + found_dequant_quant_count); +} + +void XPUQuantizeSquashPass::OpDequantSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::OpDequantXPU op_dequant_pattern{gpd.mutable_pattern(), + "op_dequant_xpu"}; + op_dequant_pattern(); + + int found_op_dequant_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash op-dequant ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(any_op, any_op, op_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, op_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, op_dequant_pattern); + GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, op_dequant_pattern); + + if (dequant_in->outputs.size() == 1) { + // Find the name of the output linking any_op to dequant_in + std::string output_name = + FindOutputNameByVarName(any_op->Op(), dequant_in->Name()); + + if (output_name.empty()) return; + any_op->Op()->SetAttr("out_dtype", dequant_out->Var()->GetDataType()); + any_op->Op()->SetOutput(output_name, + std::vector({dequant_out->Name()})); + IR_NODE_LINK_TO(any_op, dequant_out); + GraphSafeRemoveNodes(graph, {dequant_in, dequant_op}); + found_op_dequant_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_op_dequant_squash_count); + PrettyLogDetail("--- squashed %d dequant with ops", + found_op_dequant_squash_count); +} + +// conv2d_xpu, fc_xpu +void XPUQuantizeSquashPass::QuantOpSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::QuantXPUAny quant_any_pattern{gpd.mutable_pattern(), + "quant_xpu_any"}; + quant_any_pattern(); + + int found_quant_op_squash_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "squash op-dequant ops pair"; + + GET_IR_NODE_FROM_SUBGRAPH(quant_in, quant_in, quant_any_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_op, quant_op, 
quant_any_pattern); + GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, quant_any_pattern); + GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, quant_any_pattern); + + if (quant_out->outputs.size() == 1) { + std::string input_name = + FindInputNameByVarName(next_op->Op(), quant_out->Name()); + + if (input_name.empty()) return; + // Only support quant + conv2d_xpu/fc_xpu fusion + if (!(next_op->Op()->Type() == "conv2d_xpu" || + next_op->Op()->Type() == "fc_xpu")) { + return; + } + next_op->Op()->SetInput(input_name, + std::vector({quant_in->Name()})); + IR_NODE_LINK_TO(quant_in, next_op); + GraphSafeRemoveNodes(graph, {quant_out, quant_op}); + found_quant_op_squash_count++; + } + }; + gpd(graph, handler); + AddStatis(found_quant_op_squash_count); + PrettyLogDetail("--- squashed %d quantize with ops", + found_quant_op_squash_count); +} + +void XPUQuantizeSquashPass::MultipleQuantizeSquash(Graph* graph) const { + GraphPatternDetector gpd; + patterns::MultipleQuantizeXPU multiple_quantize_pattern{ + gpd.mutable_pattern(), "multiple_quantize_xpu"}; + multiple_quantize_pattern(); + + int found_multiple_quantize_squash_count = 0; + int removed_quantize = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "fuse multiple quantize ops"; + + GET_IR_NODE_FROM_SUBGRAPH(prev_out, prev_out, multiple_quantize_pattern); + + auto* first_quant_op = *(std::find_if( + prev_out->outputs.begin(), prev_out->outputs.end(), [&](Node* node) { + return (node->IsOp() && node->Op()->Type() == "quantize_xpu"); + })); + auto* first_quant_out = first_quant_op->outputs[0]; + float scale = first_quant_op->Op()->GetAttrIfExists("scale"); + + PADDLE_ENFORCE_NE(scale, + 0, + platform::errors::InvalidArgument( + "Quantize scale(%f) should not be equal 0.", scale)); + + for (int iter = prev_out->outputs.size() - 1; iter >= 0; iter--) { + auto quant_op = prev_out->outputs[iter]; + if (quant_op->IsOp() && quant_op->Op()->Type() == "quantize_xpu" && + quant_op->id() != first_quant_op->id() && + quant_op->Op()->GetAttrIfExists("scale") == scale) { + auto quant_out = quant_op->outputs[0]; + auto last_op = quant_out->outputs[0]; + auto last_op_op = last_op->Op(); + + std::string last_op_input_name = + FindInputNameByVarName(last_op_op, quant_out->Name()); + + PADDLE_ENFORCE_NE( + last_op_input_name.empty(), + true, + platform::errors::NotFound("Operator after quantize operator(%s) " + "should have quantize output as input.", + quant_out->Name())); + + // update the next operator input, + // by replacing quant_out with first_quant_out + auto last_op_names = last_op->Op()->Inputs().at(last_op_input_name); + std::replace(last_op_names.begin(), + last_op_names.end(), + quant_out->Name(), + first_quant_out->Name()); + last_op_op->SetInput(last_op_input_name, + std::vector(last_op_names)); + + IR_NODE_LINK_TO(first_quant_out, last_op); + GraphSafeRemoveNodes(graph, {quant_op, quant_out}); + removed_quantize++; + } + } + found_multiple_quantize_squash_count++; + }; + gpd(graph, handler); + AddStatis(found_multiple_quantize_squash_count); + PrettyLogDetail("--- squashed %d quantize op", removed_quantize); +} + +void XPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, + platform::errors::InvalidArgument( + "The graph in function XPUQuantizeSquashPass::ApplyImpl is null.")); + FusePassBase::Init("xpu_quantize_squash_pass", graph); + + std::unordered_map nodes_keep_counter; + FindNodesToKeep(graph, &nodes_keep_counter); + DequantQuantSquash(graph, 
&nodes_keep_counter); + OpDequantSquash(graph); + // QuantOpSquash(graph); + MultipleQuantizeSquash(graph); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(xpu_quantize_squash_pass, + paddle::framework::ir::XPUQuantizeSquashPass); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h new file mode 100644 index 0000000000000..fbfa967791304 --- /dev/null +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h @@ -0,0 +1,110 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Squash dequantize->quantize pair pattern into requantize op + */ + +class XPUQuantizeSquashPass : public FusePassBase { + public: + XPUQuantizeSquashPass(); + virtual ~XPUQuantizeSquashPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; + + /* + * For each dequantize's output find the number of operators it is an input to + */ + void FindNodesToKeep( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Don't squash unsigned dequantize with signed quantize. + * This is important for concat and elementwise ops. + * When inputs have different sign, concat will assume signed type and + * elementwise assumes first input type. 
+ */ + bool IsDequantizeQuantizeIncompatible(Node* quant_op, + Node* dequant_op, + Node* next_op) const; + + /* + * Squash dequantize-quantize ops pairs into requantize or nothing + */ + void DequantQuantSquash( + Graph* graph, + std::unordered_map* nodes_keep_counter) const; + + /* + * Squash requantize op into conv with scale_out like requantize scale_out + */ + void OpRequantSquash(Graph* graph) const; + + /* + * Squash requantize op if the next operator's input scale can be updated + */ + void RequantOpSquash(Graph* graph) const; + + /* + * Squash dequant if the previous operator has force_fp32_output attribute + */ + void OpDequantSquash(Graph* graph) const; + + /* + * Squash quantize if several quatize ops have the same scale + */ + void MultipleQuantizeSquash(Graph* graph) const; + + /* + * Squash scale if dequantize is before scale + */ + void DequantScaleSquash(Graph* graph) const; + + /* + * Squash scale if scale is before quantize + */ + void ScaleQuantSquash(Graph* graph) const; + + /* + * Squash quantize if is before bfloat16 conv2d or fused_conv2d + */ + void QuantizeBf16Conv(Graph* graph) const; + + void QuantizeBf16ConvImpl(Graph* graph, const std::string& conv_type) const; + + /* + * Squash quantize if is before conv2d_xpu/fc_xpuy + */ + void QuantOpSquash(Graph* graph) const; + + const std::string name_scope_{"squash"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 65fd8a74aa101..41d2ccd67b43a 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -561,10 +561,12 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "fast_where_xpu_fuse_pass", "elementwise_mul_add_fuse_pass", "link_xpu_op_max_pass", - "delete_isolated_node_pass", // "auto_mixed_precision_pass", "cast_mixed_precision_op_fuse_pass", - "auto_trans_quantize_op_precision_pass", + "xpu_quantize_op_pass", + "xpu_quantize_squash_pass", + // "auto_trans_quantize_op_precision_pass", + "delete_isolated_node_pass", "inplace_op_var_pass", }); use_xpu_ = true; diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 4e67144ba8a89..14188449b2def 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -657,6 +657,16 @@ func : depthwise_conv2d backward : depthwise_conv2d_grad +- op : dequantize_xpu + args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : DeQuantizeXPUInferMeta + kernel : + func : dequantize_xpu + data_type: x + optional : max + - op : det args : (Tensor x) output : Tensor @@ -2017,6 +2027,16 @@ func : qr backward : qr_grad +- op : quantize_xpu + args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : QuantizeXPUInferMeta + kernel : + func : quantize_xpu + data_type : x + optional : max + - op : real args : (Tensor x) output : Tensor (out) diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index f8139af52cb22..a4109197521a5 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -212,6 +212,8 @@ XPUOpMap& get_kl2_ops() { XPUKernelSet({phi::DataType::FLOAT32})}, {"depthwise_conv2d_transpose", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, + {"dequantize_xpu", + XPUKernelSet({phi::DataType::INT16, phi::DataType::INT8})}, 
{"diag_v2", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16, @@ -615,6 +617,8 @@ XPUOpMap& get_kl2_ops() { {"prelu_grad", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"prod_raw", XPUKernelSet({phi::DataType::FLOAT32})}, + {"quantize_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"range", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::INT64, diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 2fd87760378fc..5b31803127eb3 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -978,6 +978,16 @@ void DepthwiseConvInferMeta(const MetaTensor& input, config); } +void DeQuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -2597,6 +2607,16 @@ void PriorBoxInferMeta(const MetaTensor& input, var->set_dims(phi::make_ddim(dim_vec)); } +void QuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x, const MetaTensor& repeats, int dim, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 94d8bb606ea5d..aa469554b6fd4 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -155,6 +155,12 @@ void DepthwiseConvInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); +void DeQuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y); + void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -408,6 +414,12 @@ void PriorBoxInferMeta(const MetaTensor& input, MetaTensor* out, MetaTensor* var); +void QuantizeXPUInferMeta(const MetaTensor& x, + const MetaTensor& max, + DataType out_dtype, + float scale, + MetaTensor* y); + void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/kernels/xpu/dequantization_kernel.cc b/paddle/phi/kernels/xpu/dequantization_kernel.cc new file mode 100644 index 0000000000000..20423c1eb8920 --- /dev/null +++ b/paddle/phi/kernels/xpu/dequantization_kernel.cc @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void DeQuantizeKernelImpl(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DenseTensor* y) { + using XPUInX = typename XPUTypeTrait::Type; + using XPUOutY = typename XPUTypeTrait::Type; + + auto* y_data = ctx.template Alloc(y); + const auto* x_data = x.data(); + int64_t len = x.numel(); + const float* max_data = + max.get_ptr() == nullptr ? nullptr : max->data(); + int r = xpu::dequantization( + ctx.x_context(), + reinterpret_cast(x_data), + reinterpret_cast(y_data), + len, + max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "dequantization"); +} + +template +void DeQuantizeKernel(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DataType out_dtype, + float scale, + DenseTensor* y) { + switch (out_dtype) { + case DataType::FLOAT32: + DeQuantizeKernelImpl(ctx, x, max, y); + break; + case DataType::FLOAT16: + DeQuantizeKernelImpl(ctx, x, max, y); + break; + default: + PADDLE_THROW(phi::errors::Unavailable( + "Not supported Quantize data type from %d -> %d ", + x.dtype(), + out_dtype)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + dequantize_xpu, XPU, ALL_LAYOUT, phi::DeQuantizeKernel, int16_t, int8_t) {} diff --git a/paddle/phi/kernels/xpu/quantization_kernel.cc b/paddle/phi/kernels/xpu/quantization_kernel.cc new file mode 100644 index 0000000000000..01f6ddad93aa0 --- /dev/null +++ b/paddle/phi/kernels/xpu/quantization_kernel.cc @@ -0,0 +1,70 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void QuantizeKernelImpl(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DenseTensor* y) { + using XPUInX = typename XPUTypeTrait::Type; + using XPUOutY = typename XPUTypeTrait::Type; + + auto* y_data = ctx.template Alloc(y); + const auto* x_data = x.data(); + int64_t len = x.numel(); + const float* max_data = + max.get_ptr() == nullptr ? 
nullptr : max->data(); + int r = xpu::quantization( + ctx.x_context(), + reinterpret_cast(x_data), + reinterpret_cast(y_data), + len, + max_data); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); +} + +template +void QuantizeKernel(const Context& ctx, + const DenseTensor& x, + const paddle::optional& max, + DataType out_dtype, + float scale, + DenseTensor* y) { + switch (out_dtype) { + case DataType::INT16: + QuantizeKernelImpl(ctx, x, max, y); + break; + case DataType::INT8: + QuantizeKernelImpl(ctx, x, max, y); + break; + default: + PADDLE_THROW(phi::errors::Unavailable( + "Not supported quantize data type from %d -> %d ", + x.dtype(), + out_dtype)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(quantize_xpu, + XPU, + ALL_LAYOUT, + phi::QuantizeKernel, + float, + phi::dtype::float16) {} From da5cb07f2e95c451378043989db64b570a990030 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Thu, 19 Oct 2023 14:26:50 +0800 Subject: [PATCH 04/15] support fp16 fix --- .../framework/ir/auto_mixed_precision_pass.cc | 11 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 10 +- .../delete_weight_dequant_linear_op_pass.cc | 28 +-- paddle/fluid/framework/ir/graph.h | 21 ++ .../ir/quantize_related_pass_utils.h | 103 +++++++++- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 184 ++++++++---------- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 105 +++++----- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 14 +- .../ir/xpu/reshape2_matmul_xpu_fuse_pass.cc | 29 +-- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 112 ++++++++--- .../framework/ir/xpu/xpu_quantize_op_pass.h | 7 +- .../passes/convert_to_mixed_precision.cc | 4 - 12 files changed, 379 insertions(+), 249 deletions(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index 497dcae8395d5..fe5ec348bf707 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -524,7 +524,6 @@ void AutoMixedPrecisionPass::UpdateOpPrecision() const { vars_should_not_low_precision.insert(in_var_node->Var()->Name()); } } - // when op_1 only support cpu kernel. if op_2's intput var is op_1's // output var, then op_2 should not run at low precision. 
if (GetOpOriginalType(op_type) != "feed" && @@ -688,6 +687,16 @@ bool AutoMixedPrecisionPass::InputVarsNotConvert( if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { return true; } + } else if (GetOpOriginalType(op_desc->Type()) == "quantize_linear" || + GetOpOriginalType(op_desc->Type()) == "dequantize_linear") { + auto vecs = op_desc->Input("Scale"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("ZeroPoint"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } } } diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 9245305889907..0a9fc07a7cb07 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -91,6 +91,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { GraphPatternDetector gpd; auto* scope = param_scope(); + BlockDesc* block = nullptr; PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument( @@ -113,6 +114,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { return; } */ + block = quantize_linear_op->Op()->Block(); std::unordered_set nodes2rm = {}; // Get input scale from tensor @@ -140,15 +142,10 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { int nums_any_ops = static_cast(dequantize_linear_op_out->outputs.size()); - int bit_length = - PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length")); for (int i = 0; i < nums_any_ops; ++i) { auto* any_op_desc = dequantize_linear_op_out->outputs[i]->Op(); any_op_desc->SetAttr("Input_scale_" + quantize_linear_op_x->Var()->Name(), input_scale); - any_op_desc->SetAttr( - "Input_bit_length_" + quantize_linear_op_x->Var()->Name(), - bit_length); if (!var_quant_scales.count(quantize_linear_op_x->Var()->Name())) { var_quant_scales.insert( std::make_pair(quantize_linear_op_x->Var()->Name(), @@ -174,8 +171,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { gpd(graph, handler); AddStatis(found_count); - // save var_quant_scales in the temporary save op's attr01 - SaveInfoInTheTmpOp( + SaveQuantInfoInTheGraph( graph, "has_quant_info", "var_quant_scales", var_quant_scales); } diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 0140fb664b1de..968120068b92a 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -35,7 +35,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { true, platform::errors::InvalidArgument( "Graph must have kParamScopeAttr attribute.")); - VLOG(1) << "Handle delete weight dequant linear op pass ..."; + VLOG(3) << "Handle delete weight dequant linear op pass ..."; auto& scope = graph->Get(kParamScopeAttr); bool is_int8 = false; @@ -44,10 +44,10 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - VLOG(1) << "Dequantize linear op Type: " << op->Type(); if (op->Type() == "dequantize_linear") { - VLOG(1) << "Dequantize linear op is come in: " << op->Type(); - Node *weight_var_node, *calcu_op_node, *while_op_node; + Node* weight_var_node = nullptr; + Node* calcu_op_node = 
nullptr; + Node* while_op_node = nullptr; Node *dequantized_weight_var_node = nullptr, *scale_var_node = nullptr; // 1. Judge whether for dequant weight and find // weight_var_node/scale_var_node @@ -60,9 +60,12 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { scale_var_node = input_node; } } else { - return; + break; } } + if (weight_var_node == nullptr || scale_var_node == nullptr) { + continue; + } // 2. Find next_op_node // For while op: delete its input which is related to dequantized // For calculation op: set weight scale as their attributes @@ -107,7 +110,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { } } else { PADDLE_THROW(platform::errors::Unimplemented( - "The dtype of quantization scale must be FP32/16, " + "The dtype of quantization scale must be FP32/FP16, " "but received %d, which is not supported.", weight_scale_tensor->dtype())); } @@ -147,13 +150,12 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_nums)); calcu_op_desc->SetAttr("weight_scale", weight_scale); } - calcu_op_desc->SetAttr("weight_quant_axis", quant_axis); - calcu_op_desc->SetAttr("weight_bit_length", bit_length); - calcu_op_desc->SetAttr("enable_int8", true); - VLOG(1) << "dequantized_weight_var_node->Var()->Name():" - << dequantized_weight_var_node->Var()->Name(); - VLOG(1) << "weight_var_node->Var()->Name(): " - << weight_var_node->Var()->Name(); + if (bit_length == 8) { + // Current 8-bit quantization only supports int8 + calcu_op_desc->SetAttr("op_weights_precision", + std::string("int8")); + } + calcu_op_desc->RenameInput( dequantized_weight_var_node->Var()->Name(), weight_var_node->Var()->Name()); diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 3596f4e0f0e29..e29e5a2a9a9d2 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -177,7 +177,11 @@ class Graph { platform::errors::AlreadyExists( "The attribute %s to be set already exists in the graph.", attr_name)); + VLOG(1) << "set attribute " << attr_name; attrs_[attr_name] = attr; + VLOG(1) << "attrs_ size " << attrs_.size(); + std::vector attr_names = AttrNames(); + VLOG(1) << "attr_names size " << attr_names.size(); attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; delete attr; @@ -412,6 +416,23 @@ class Graph { return sub_graphs_.size(); } + std::vector AttrNames() const { + VLOG(1) << "graph addr:" << this; + if (FLAGS_convert_all_blocks) { + if (IsMainGraph()) { + return GetSubGraph(0)->AttrNames(); + } + } + std::vector res; + res.reserve(attrs_.size()); + VLOG(1) << "AttrNames attr size: " << attrs_.size(); + for (auto &attr : attrs_) { + res.push_back(attr.first); + VLOG(1) << "AttrNames: " << attr.first; + } + return res; + } + private: // TODO(levi): delete this interface after when we can convert all // blocks into sub_graphs. 
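
For the next hunk: quantize_related_pass_utils.h now stores the collected scales directly as graph attributes (one std::vector<float> attribute per variable, named "<var>_var_quant_scales_has_quant_info", plus a "has_quant_info" flag), which is also why Graph::AttrNames() is added above. A minimal usage sketch, assuming the helpers introduced in this patch and a graph / filter_node supplied by the calling pass:

#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h"

namespace paddle {
namespace framework {
namespace ir {

// Illustrative only: producer/consumer round trip of the per-variable scales.
void QuantScaleRoundTripSketch(ir::Graph* graph, Node* filter_node) {
  std::unordered_map<std::string, std::vector<float>> scales;
  scales[filter_node->Name()] = {0.0123f};  // hypothetical per-tensor scale

  // Producer side (delete_*_linear passes): publish scales on the graph.
  SaveQuantInfoInTheGraph(graph, "has_quant_info", "var_quant_scales", scales);

  // Consumer side (conv2d_xpu/fc_xpu fuse passes, xpu_quantize_op_pass):
  auto restored =
      GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales");
  if (AreScalesPresentForNodes(&restored, {filter_node})) {
    float scale = GetScaleValueForNode(&restored, filter_node);
    (void)scale;  // e.g. used to fill an int8 *_max tensor
  }
}

}  // namespace ir
}  // namespace framework
}  // namespace paddle
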
diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h index ce97cdd5fee33..d6c54cef47d90 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -17,12 +17,13 @@ #include #include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" namespace paddle { namespace framework { namespace ir { -static void SaveInfoInTheTmpOp( +static inline void SaveInfoInTheTmpOp( ir::Graph* graph, const std::string& flag, const std::string& key_suffix, @@ -40,6 +41,20 @@ static void SaveInfoInTheTmpOp( } } +static inline void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map) { + VLOG(1) << "Save quant info in the graph!"; + const std::string suffix = "_" + key_suffix + "_" + flag; + graph->Set(flag, new bool(true)); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + VLOG(1) << "SaveQuantInfoInTheGraph set attr: " << iter->first + suffix; + graph->Set(iter->first + suffix, new std::vector(iter->second)); + } +} + static void GetInfoFromTheTmpOp( ir::Graph* graph, const std::string& flag, @@ -51,34 +66,104 @@ static void GetInfoFromTheTmpOp( for (auto* op_node : ir::TopologyVarientSort(*graph, static_cast(0))) { if (!op_node->IsOp() || op_node->Op()->Type() != "save") continue; - VLOG(5) << "Come in save op"; + VLOG(1) << "Come in save op"; auto* op_desc = op_node->Op(); if (op_desc->GetAttrIfExists(flag)) { - VLOG(5) << "flag is true"; + VLOG(1) << "flag is true"; op_desc->RemoveAttr(flag); std::vector attr_names = op_desc->AttrNames(); + VLOG(1) << "attr_names size:" << attr_names.size(); for (auto fake_name : attr_names) { - VLOG(5) << "fake_name:" << fake_name; + VLOG(1) << "fake_name:" << fake_name; size_t pos = fake_name.find(suffix); if (pos != std::string::npos) { std::string name = fake_name.substr(0, pos); - VLOG(5) << "name:" << name; + VLOG(1) << "name:" << name; auto scales_vector = PADDLE_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); - VLOG(5) << "scales_vector:" << scales_vector[0]; + VLOG(1) << "scales_vector:" << scales_vector[0]; info_map->insert(std::make_pair(name, scales_vector)); - VLOG(5) << "insert success:"; + VLOG(1) << "insert success:"; op_desc->RemoveAttr(fake_name); - VLOG(5) << "remove success:"; + VLOG(1) << "remove success:"; } } graph->RemoveNode(op_node); - VLOG(5) << "remove op node success:"; + VLOG(1) << "remove op node success:"; break; } } } +static inline void GetQuantInfoFromTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + std::unordered_map>* info_map) { + VLOG(1) << "Get quant info from the graph attrs!"; + const std::string suffix = "_" + key_suffix + "_" + flag; + VLOG(1) << "flag:" << (graph->Has(flag) ? 
1 : 0); + if (graph->Has(flag)) { + std::vector attr_names = graph->AttrNames(); + VLOG(1) << "attr_names size:" << attr_names.size(); + for (auto fake_name : attr_names) { + VLOG(1) << "fake_name:" << fake_name; + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + VLOG(1) << "name:" << name; + auto scales_vector = graph->Get>(fake_name); + VLOG(1) << "scales_vector:" << scales_vector[0]; + info_map->insert(std::make_pair(name, scales_vector)); + } + } + } +} + +static inline std::unordered_map> +GetQuantInfoFromTheGraph(ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix) { + std::unordered_map> info_map; + VLOG(1) << "Get quant info from the graph attrs!"; + const std::string suffix = "_" + key_suffix + "_" + flag; + VLOG(1) << "flag:" << (graph->Has(flag) ? 1 : 0); + if (graph->Has(flag)) { + std::vector attr_names = graph->AttrNames(); + VLOG(1) << "attr_names size:" << attr_names.size(); + for (auto fake_name : attr_names) { + VLOG(1) << "fake_name:" << fake_name; + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + VLOG(1) << "name:" << name; + auto scales_vector = graph->Get>(fake_name); + VLOG(1) << "scales_vector:" << scales_vector[0]; + info_map.insert(std::make_pair(name, scales_vector)); + } + } + } + return info_map; +} + +static inline bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes) { + bool present = true; + for (auto node : nodes) { + if (var_quant_scales->count(node->Name()) == 0) { + present = false; + } + } + return present; +} + +static inline float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name())[0]; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 19e006d535409..0605f2355ce2b 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -370,7 +371,7 @@ class Conv2dXPUFusePass : public FusePassBase { bool with_conv_bias, bool with_bn, bool with_scale, - bool enable_int8) const; + std::string op_weights_precision) const; void CreateFusionInputs( ir::Graph* graph, @@ -378,7 +379,9 @@ class Conv2dXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionBranch( ir::Graph* graph, @@ -386,7 +389,9 @@ class Conv2dXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionOutputs( ir::Graph* graph, @@ -394,8 +399,10 @@ class Conv2dXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, 
std::map* fusion_nodes_map, - bool enable_int8, - std::string act_type) const; + std::string op_weights_precision, + std::string act_type, + std::unordered_map>* var_quant_scales) + const; const std::unordered_set support_quant_op_type_{"conv2d", "conv2d_xpu"}; @@ -475,7 +482,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( bool with_conv_bias, bool with_bn, bool with_scale, - bool enable_int8) const { + std::string op_weights_precision) const { // Get Node auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( @@ -589,7 +596,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights - if (!enable_int8) { + if (op_weights_precision != "int8") { float* filter_ptr = filter_t->mutable_data(paddle::platform::CPUPlace()); for (int i = 0; i < mean_len; ++i) { @@ -631,7 +638,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // deal with scale op - if (with_scale && !enable_int8) { + if (with_scale) { auto* scale = GetNodeFromNodesMap(nodes_map, "scale", "scale"); PADDLE_ENFORCE_EQ( scale != nullptr, @@ -657,10 +664,16 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } } // recompute weight as scale op - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); - for (int i = 0; i < filter_len; ++i) { - filter_ptr[i] *= scale_val_; + if (op_weights_precision != "int8") { + float* filter_ptr = + filter_t->mutable_data(paddle::platform::CPUPlace()); + for (int i = 0; i < filter_len; ++i) { + filter_ptr[i] *= scale_val_; + } + } else { + for (int i = 0; i < weight_scale.size(); i++) { + weight_scale[i] *= scale_val_; + } } } @@ -669,7 +682,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( Node* filter_intx = nullptr; Node* filter_max = nullptr; Node* scale_max = nullptr; - if (!enable_int8) { + if (op_weights_precision != "int8") { PrepareWeight(graph, scope, block, @@ -735,7 +748,9 @@ void Conv2dXPUFusePass::CreateFusionInputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( @@ -750,9 +765,13 @@ void Conv2dXPUFusePass::CreateFusionInputs( // input max std::string conv_input_max_name = input->Name() + "_input_max"; Node* conv2d_xpu_input_max = nullptr; - if (enable_int8) { - float input_scale = - conv->Op()->GetAttrIfExists("Input_scale_" + input->Name()); + if (op_weights_precision == "int8") { + PADDLE_ENFORCE_EQ(AreScalesPresentForNodes(var_quant_scales, {input}), + true, + platform::errors::InvalidArgument( + "When conv op is running in int8 precision, the " + "scales of input var should be present in!")); + float input_scale = GetScaleValueForNode(var_quant_scales, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc conv_input_max_desc(conv_input_max_name); conv_input_max_desc.SetPersistable( @@ -782,7 +801,9 @@ void Conv2dXPUFusePass::CreateFusionBranch( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* ew_branch_add = GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); @@ -798,7 +819,7 @@ void Conv2dXPUFusePass::CreateFusionBranch( std::string ew_branch_add_max_name = ew_branch_add_in->Name() + 
"branch_max"; Node* ew_branch_add_max = FindNodeWithName(graph, ew_branch_add_max_name); - if (enable_int8 && !ew_branch_add_max) { + if (op_weights_precision == "int8" && !ew_branch_add_max) { int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc ew_branch_add_in_max_desc(ew_branch_add_max_name); ew_branch_add_in_max_desc.SetPersistable( @@ -808,15 +829,19 @@ void Conv2dXPUFusePass::CreateFusionBranch( ew_branch_add_in_max_desc.SetDataType( proto::VarType::Type::VarType_Type_FP32); ew_branch_add_max = graph->CreateVarNode(&ew_branch_add_in_max_desc); - float ew_branch_add_scale = ew_branch_add->Op()->GetAttrIfExists( - "Input_scale_" + ew_branch_add_in->Name()); + PADDLE_ENFORCE_EQ( + AreScalesPresentForNodes(var_quant_scales, {ew_branch_add_in}), + true, + platform::errors::InvalidArgument( + "When conv op is running in int8 precision with branch add, the " + "scales of branch var should be present in!")); + float ew_branch_add_scale = + GetScaleValueForNode(var_quant_scales, ew_branch_add_in); auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( conv != nullptr, true, platform::errors::InvalidArgument("conv node ptr can not be null")); - conv->Op()->SetAttr("Input_scale_" + ew_branch_add_in->Name(), - ew_branch_add_scale); auto ew_branch_add_max_tensor = scope->Var(ew_branch_add_max_name)->GetMutable(); ew_branch_add_max_tensor->set_type(phi::DataType::FLOAT32); @@ -839,8 +864,10 @@ void Conv2dXPUFusePass::CreateFusionOutputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8, - std::string act_type) const { + std::string op_weights_precision, + std::string act_type, + std::unordered_map>* var_quant_scales) + const { auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( conv != nullptr, @@ -921,7 +948,8 @@ void Conv2dXPUFusePass::CreateFusionOutputs( (*fusion_nodes_map)["out"] = conv2d_out_var_node; // Create out max in - if (enable_int8) { + if (op_weights_precision == "int8" && + AreScalesPresentForNodes(var_quant_scales, {conv2d_out_var_node})) { std::string conv_out_max_in_name = conv2d_xpu_out_name + "_max_in"; int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc conv_out_max_in_desc(conv_out_max_in_name); @@ -934,25 +962,8 @@ void Conv2dXPUFusePass::CreateFusionOutputs( block_out_max_in_desc->SetShape(conv_out_max_in_desc.GetShape()); block_out_max_in_desc->SetDataType(conv_out_max_in_desc.GetDataType()); - auto GetOutputScale = [&](Node* var_node, std::string name) -> float { - int nums_any_ops = var_node->outputs.size(); - for (size_t i = 0; i < nums_any_ops; ++i) { - auto* any_op_desc = conv2d_out_var_node->outputs[i]->Op(); - VLOG(1) << "any_op_desc: " << any_op_desc->Type(); - if (any_op_desc->HasAttr("Input_scale_" + name)) { - VLOG(1) << "find it: " - << "Input_scale_" + name; - return any_op_desc->GetAttrIfExists("Input_scale_" + name); - } - } - return 0; - }; float output_scale = - GetOutputScale(conv2d_out_var_node, conv2d_xpu_out_name); - conv->Op()->SetAttr("Input_scale_" + conv2d_xpu_out_name, output_scale); - VLOG(1) << "conv2d_xpu_out_name:" << conv2d_xpu_out_name - << " output_scale: " << output_scale - << "conv2d_out_var_node name:" << conv2d_out_var_node->Name(); + GetScaleValueForNode(var_quant_scales, conv2d_out_var_node); phi::DenseTensor out_max_in_cpu_tensor; auto* cpu_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); @@ -995,7 +1006,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, 
auto* scope = param_scope(); PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - + std::unordered_map> var_quant_scales = + GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -1066,9 +1078,12 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, {"out", nullptr}, {"out_max", nullptr}}; - bool enable_int8 = conv->Op()->GetAttrIfExists("enable_int8"); - std::string op_precision_str = enable_int8 ? "int8" : "fp32"; - VLOG(4) << "Conv2d fusion fuse pass is running on " << op_precision_str + std::string op_weights_precision = "float32"; + if (conv->Op()->HasAttr("op_weights_precision")) { + op_weights_precision = + conv->Op()->GetAttrIfExists("op_weights_precision"); + } + VLOG(4) << "Conv2d fusion fuse pass is running on " << op_weights_precision << " precision!"; auto* block = conv->Op()->Block(); CreateFusionWeightsAndBias(graph, @@ -1079,45 +1094,31 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, with_conv_bias, with_bn, with_scale, - enable_int8); - VLOG(1) << "CreateFusionWeightsAndBias success!"; - CreateFusionInputs( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - VLOG(1) << "CreateFusionInputs success!"; - CreateFusionBranch( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - VLOG(1) << "CreateFusionBranch success!"; + op_weights_precision); + CreateFusionInputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); + CreateFusionBranch(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); CreateFusionOutputs(graph, scope, block, nodes_map, &fusion_nodes_map, - enable_int8, - act_type); - VLOG(1) << "CreateFusionOutputs success!"; - // int out_dtype = PADDLE_GET_CONST(int, conv->Op()->GetAttr("out_dtype")); - // if (out_dtype == proto::VarType::Type::VarType_Type_INT8) { - // fusion_nodes_map["out"]->Var()->SetDataType( - // proto::VarType::Type::VarType_Type_INT8); - // if (fusion_nodes_map["branch"]) { - // fusion_nodes_map["branch"]->Var()->SetDataType( - // proto::VarType::Type::VarType_Type_INT8); - // } - // } - // Generate conv2d_xpu op + op_weights_precision, + act_type, + &var_quant_scales); + framework::OpDesc conv2d_xpu_op_desc(block); - for (auto [first, second] : fusion_nodes_map) { - VLOG(1) << "first: " << first << " second: " << second; - if (first == "x" || first == "out" || first == "out_max" || - first == "branch") - continue; - if (second != nullptr) { - auto* temp_tensor = - scope->FindVar(second->Name())->GetMutable(); - VLOG(1) << *temp_tensor; - } - } - // set input&output var conv2d_xpu_op_desc.SetType("conv2d_xpu"); conv2d_xpu_op_desc.SetInput("x", {fusion_nodes_map["x"]->Name()}); if (fusion_nodes_map["x_max"]) { @@ -1159,7 +1160,6 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, {fusion_nodes_map["branch_max"]->Name()}); } } - VLOG(1) << "creat conv2d_xpu_op_desc success!"; // set attrs of conv2d_xpu float act_param_ = 0.0f; if (!act_type.empty()) { @@ -1199,26 +1199,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, // out_dtype is same to input precision conv2d_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); - conv2d_xpu_op_desc.SetAttr( - "enable_int8", conv->Op()->GetAttrIfExists("enable_int8")); - if (enable_int8) { - conv2d_xpu_op_desc.SetAttr( - "Input_scale_" + 
fusion_nodes_map["out"]->Name(), - conv->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["out"]->Name())); - conv2d_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["x"]->Name(), - conv->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["x"]->Name())); - if (fusion_nodes_map["branch"]) { - conv2d_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["branch"]->Name(), - conv->Op()->GetAttrIfExists( - "Input_scale_" + fusion_nodes_map["branch"]->Name())); - } - } + conv2d_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); - VLOG(1) << "Set attr success!"; // Link node auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); IR_NODE_LINK_TO(fusion_nodes_map["x"], conv2d_xpu); diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index f087b7caf20ab..cb007fd435178 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -253,7 +254,7 @@ class FcXPUFusePass : public FusePassBase { std::map* fusion_nodes_map, bool with_bias, bool with_bn, - bool enable_int8) const; + std::string op_weights_precision) const; void CreateFusionOutputs( ir::Graph* graph, @@ -261,7 +262,9 @@ class FcXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionInputs( ir::Graph* graph, @@ -269,7 +272,9 @@ class FcXPUFusePass : public FusePassBase { BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; Node* GetNodeFromNodesMap( const std::map>& nodes_map, @@ -337,7 +342,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( std::map* fusion_nodes_map, bool with_bias, bool with_bn, - bool enable_int8) const { + std::string op_weights_precision) const { // Get Node auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( @@ -449,7 +454,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights - if (!enable_int8) { + if (op_weights_precision != "int8") { float* filter_ptr = filter_t->mutable_data(paddle::platform::CPUPlace()); for (int i = 0; i < mean_len; ++i) { @@ -495,7 +500,7 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( Node* filter_intx = nullptr; Node* filter_max = nullptr; Node* scale_max = nullptr; - if (!enable_int8) { + if (op_weights_precision != "int8") { PrepareWeight(graph, scope, block, @@ -561,7 +566,9 @@ void FcXPUFusePass::CreateFusionOutputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( mul != nullptr, @@ -617,7 +624,8 @@ void FcXPUFusePass::CreateFusionOutputs( (*fusion_nodes_map)["out"] = 
fc_out_var_node; // Create out max in - if (enable_int8) { + if (op_weights_precision == "int8" && + AreScalesPresentForNodes(var_quant_scales, {fc_out_var_node})) { std::string fc_out_max_in_name = fc_xpu_out_name + "_max_in"; int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc fc_out_max_in_desc(fc_out_max_in_name); @@ -630,24 +638,8 @@ void FcXPUFusePass::CreateFusionOutputs( block_out_max_in_desc->SetShape(fc_out_max_in_desc.GetShape()); block_out_max_in_desc->SetDataType(fc_out_max_in_desc.GetDataType()); - auto GetOutputScale = [&](Node* var_node, std::string name) -> float { - int nums_any_ops = var_node->outputs.size(); - for (size_t i = 0; i < nums_any_ops; ++i) { - auto* any_op_desc = fc_out_var_node->outputs[i]->Op(); - VLOG(1) << "any_op_desc: " << any_op_desc->Type(); - if (any_op_desc->HasAttr("Input_scale_" + name)) { - VLOG(1) << "find it: " - << "Input_scale_" + name; - return any_op_desc->GetAttrIfExists("Input_scale_" + name); - } - } - return 0; - }; - float output_scale = GetOutputScale(fc_out_var_node, fc_xpu_out_name); - mul->Op()->SetAttr("Input_scale_" + fc_xpu_out_name, output_scale); - VLOG(1) << "fc_xpu_out_name:" << fc_xpu_out_name - << " output_scale: " << output_scale - << "fc_out_var_node name:" << fc_out_var_node->Name(); + float output_scale = + GetScaleValueForNode(var_quant_scales, fc_out_var_node); phi::DenseTensor out_max_in_cpu_tensor; auto* cpu_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); @@ -675,7 +667,9 @@ void FcXPUFusePass::CreateFusionInputs( BlockDesc* block, const std::map>& nodes_map, std::map* fusion_nodes_map, - bool enable_int8) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( @@ -690,9 +684,13 @@ void FcXPUFusePass::CreateFusionInputs( // x max std::string mul_x_max_name = mul_x->Name() + "_max"; Node* mul_x_max = nullptr; - if (enable_int8) { - float input_scale = - mul->Op()->GetAttrIfExists("Input_scale_" + mul_x->Name()); + if (op_weights_precision == "int8") { + PADDLE_ENFORCE_EQ(AreScalesPresentForNodes(var_quant_scales, {mul_x}), + true, + platform::errors::InvalidArgument( + "When fc op is running in int8 precision, the scales " + "of input var should be present in!")); + float input_scale = GetScaleValueForNode(var_quant_scales, mul_x); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc x_max_desc(mul_x_max_name); x_max_desc.SetPersistable( @@ -729,6 +727,8 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, with_bn, act_type); auto* scope = param_scope(); + std::unordered_map> var_quant_scales = + GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -784,10 +784,12 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, {"out_max_in", nullptr}, {"out", nullptr}, {"out_max", nullptr}}; - - bool enable_int8 = mul->Op()->GetAttrIfExists("enable_int8"); - std::string op_precision_str = enable_int8 ? 
"int8" : "fp32"; - VLOG(1) << "FC fusion fuse pass is running on " << op_precision_str + std::string op_weights_precision = "float32"; + if (mul->Op()->HasAttr("op_weights_precision")) { + op_weights_precision = + mul->Op()->GetAttrIfExists("op_weights_precision"); + } + VLOG(4) << "FC fusion fuse pass is running on " << op_weights_precision << " precision!"; auto* block = mul->Op()->Block(); CreateFusionWeightsAndBias(graph, @@ -798,12 +800,21 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, &fusion_nodes_map, with_bias, with_bn, - enable_int8); - CreateFusionInputs( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - CreateFusionOutputs( - graph, scope, block, nodes_map, &fusion_nodes_map, enable_int8); - VLOG(1) << "CreateFusionOutputs success!"; + op_weights_precision); + CreateFusionInputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); + CreateFusionOutputs(graph, + scope, + block, + nodes_map, + &fusion_nodes_map, + op_weights_precision, + &var_quant_scales); // Generate fc_xpu op framework::OpDesc fc_xpu_op_desc(block); @@ -854,22 +865,10 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope"))); } } + fc_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); // out_dtype is same to input precision fc_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); - fc_xpu_op_desc.SetAttr("enable_int8", - mul->Op()->GetAttrIfExists("enable_int8")); - if (enable_int8) { - fc_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["out"]->Name(), - mul->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["out"]->Name())); - fc_xpu_op_desc.SetAttr( - "Input_scale_" + fusion_nodes_map["x"]->Name(), - mul->Op()->GetAttrIfExists("Input_scale_" + - fusion_nodes_map["x"]->Name())); - } - auto* fc_xpu = graph->CreateOpNode(&fc_xpu_op_desc); IR_NODE_LINK_TO(fusion_nodes_map["x"], fc_xpu); if (fusion_nodes_map["x_max"]) { diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index d9ab5448d0fda..9fa5893b81666 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -80,9 +80,10 @@ LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern, auto* fusion_op = pattern->NewNode(fusion_op_repr()) ->assert_is_op("conv2d_xpu") ->assert_more([&](Node* node) { - bool enable_int8 = - node->Op()->GetAttrIfExists("enable_int8"); - return !enable_int8; + std::string op_weights_precision = + node->Op()->GetAttrIfExists( + "op_weights_precision"); + return op_weights_precision != "int8"; }); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x"); @@ -109,9 +110,10 @@ LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope) auto* fusion_op = pattern->NewNode(fusion_op_repr()) ->assert_is_op("fc_xpu") ->assert_more([&](Node* node) { - bool enable_int8 = - node->Op()->GetAttrIfExists("enable_int8"); - return !enable_int8; + std::string op_weights_precision = + node->Op()->GetAttrIfExists( + "op_weights_precision"); + return op_weights_precision != "int8"; }); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); diff --git a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc index fff3c4020b544..a06319250a9cb 100644 --- 
a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc @@ -286,36 +286,13 @@ void MapMatmulV2ToMatmulXPUPass::MapMatmulV2ToMatmul(ir::Graph* graph) const { desc.SetAttr("transpose_X", matmul_v2->Op()->GetAttr("trans_x")); desc.SetAttr("transpose_Y", matmul_v2->Op()->GetAttr("trans_y")); desc.SetAttr("alpha", 1.0f); - if (matmul_v2->Op()->HasAttr("enable_int8")) { - desc.SetAttr("enable_int8", matmul_v2->Op()->GetAttr("enable_int8")); - } - if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_x->Name())) { - desc.SetAttr("Input_scale_" + matmul_x->Name(), - matmul_v2->Op()->GetAttr("Input_scale_" + matmul_x->Name())); - } - if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_y->Name())) { - desc.SetAttr("Input_scale_" + matmul_y->Name(), - matmul_v2->Op()->GetAttr("Input_scale_" + matmul_y->Name())); - } - if (matmul_v2->Op()->HasAttr("Input_scale_" + matmul_out->Name())) { - desc.SetAttr( - "Input_scale_" + matmul_out->Name(), - matmul_v2->Op()->GetAttr("Input_scale_" + matmul_out->Name())); + if (matmul_v2->Op()->HasAttr("op_weights_precision")) { + desc.SetAttr("op_weights_precision", + matmul_v2->Op()->GetAttr("op_weights_precision")); } if (matmul_v2->Op()->HasAttr("weight_scale")) { desc.SetAttr("weight_scale", matmul_v2->Op()->GetAttr("weight_scale")); } - if (matmul_v2->Op()->HasAttr("weight_bit_length")) { - desc.SetAttr("weight_bit_length", - matmul_v2->Op()->GetAttr("weight_bit_length")); - } - if (matmul_v2->Op()->HasAttr("weight_quant_axis")) { - desc.SetAttr("weight_quant_axis", - matmul_v2->Op()->GetAttr("weight_quant_axis")); - } - if (matmul_v2->Op()->HasAttr("use_mkldnn")) { - desc.SetAttr("use_mkldnn", matmul_v2->Op()->GetAttr("use_mkldnn")); - } auto matmul_node = graph->CreateOpNode(&desc); IR_NODE_LINK_TO(matmul_x, matmul_node); IR_NODE_LINK_TO(matmul_y, matmul_node); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index a8fc7102a8d88..0c2ab8e67f0d0 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -32,29 +32,34 @@ static void UnlinkNodes(ir::Node* a, ir::Node* b) { b->inputs.end()); } +static void MarkAndLogCannotQuantizeOp(Node* op, + const char* details = nullptr) { + std::stringstream msg_ss; + msg_ss << "Cannot quantize operator " << op->Name() + << " (type: " << op->Op()->Type() << ", id: " << op->id() << ")."; + if (details) msg_ss << " " << details; + VLOG(2) << msg_ss.str().c_str(); + op->Op()->SetAttr("xpu_op_calc_data_type", std::string("float32")); +} void XPUQuantizeOpPass::GetQuantInfo(Graph* graph) const { - GetInfoFromTheTmpOp( - graph, - "has_quant_info", - "var_quant_scales", - const_cast>*>( - &var_quant_scales_)); + var_quant_scales_ = + GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); } -bool XPUQuantizeOpPass::AreScalesPresentForNodes( - std::initializer_list nodes) const { - bool present = true; - for (auto node : nodes) { - if (var_quant_scales_.count(node->Name()) == 0) { - present = false; - } - } - return present; -} +// bool XPUQuantizeOpPass::AreScalesPresentForNodes( +// std::initializer_list nodes) const { +// bool present = true; +// for (auto node : nodes) { +// if (var_quant_scales_.count(node->Name()) == 0) { +// present = false; +// } +// } +// return present; +// } -float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { - return var_quant_scales_.at(node->Name())[0]; -} 
+// float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { +// return var_quant_scales_.at(node->Name())[0]; +// } void XPUQuantizeOpPass::QuantizeInput(Graph* g, Node* op, @@ -78,7 +83,7 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, proto::VarType::Type::VarType_Type_INT8); // Create quantize max_ptr node - float scale = GetScaleValueForNode(input); + float scale = GetScaleValueForNode(&var_quant_scales_, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = input->Name() + "_quantize_max"; VarDesc input_max_desc(input_max_name); @@ -144,7 +149,7 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, proto::VarType::Type::VarType_Type_INT8); // Create dequantize max_ptr node - float scale = GetScaleValueForNode(output); + float scale = GetScaleValueForNode(&var_quant_scales_, output); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = output->Name() + "_dequantize_max"; VarDesc input_max_desc(input_max_name); @@ -221,16 +226,15 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { out_var_node = output_node; } } - if (!AreScalesPresentForNodes({x_var_node})) { - // MarkAndLogCannotQuantizeOp(conv_op, - // "No scale available for the operator"); + if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); return; } QuantizeInput(graph, n, x_var_node, "x"); // Branch input if (branch_var_node != nullptr) { - if (AreScalesPresentForNodes({branch_var_node})) { + if (AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node})) { QuantizeInput(graph, n, branch_var_node, "branch"); } else { n->Op()->SetAttr("xpu_op_force_output_precision", @@ -238,7 +242,61 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { } } - auto has_output_scale = AreScalesPresentForNodes({out_var_node}); + auto has_output_scale = + AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); + if (has_output_scale) { + DequantizeOutput(graph, n, out_var_node, "out"); + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + } else { + n->Op()->SetAttr("xpu_op_force_output_precision", + x_var_node->Var()->GetDataType()); + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + } + } + } +} + +void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const { + for (auto* n : graph->Nodes()) { + if (n->IsOp()) { + auto* op = n->Op(); + if (op->Type() != "fc_xpu") { + continue; + } + Node* w_var_node = nullptr; + Node* x_var_node = nullptr; + Node* out_var_node = nullptr; + + for (auto* input_node : n->inputs) { + if (!input_node->IsVar()) { + continue; + } + if (input_node->Var()->Name() == op->Input("x")[0]) { + x_var_node = input_node; + } else if (input_node->Var()->Name() == op->Input("w")[0]) { + w_var_node = input_node; + } + } + + for (auto* output_node : n->outputs) { + if (!output_node->IsVar()) { + continue; + } + if (output_node->Var()->Name() == op->Output("out")[0]) { + out_var_node = output_node; + } + } + if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); + return; + } + + QuantizeInput(graph, n, x_var_node, "x"); + + auto has_output_scale = + AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); if (has_output_scale) { DequantizeOutput(graph, n, out_var_node, "out"); n->Op()->SetAttr( @@ -266,6 +324,8 @@ void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) 
const { VLOG(1) << "Get quant info from graph success."; QuantizeConv(graph); VLOG(1) << "Quantize conv of the graph success."; + QuantizeFC(graph); + VLOG(1) << "Quantize fc of the graph success."; } } // namespace ir diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h index 0b74682009351..1deb4bebe0dc7 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h @@ -38,6 +38,7 @@ class XPUQuantizeOpPass : public FusePassBase { protected: void ApplyImpl(Graph* graph) const override; void QuantizeConv(Graph* graph) const; + void QuantizeFC(Graph* graph) const; private: void QuantizeInput(Graph* g, @@ -52,11 +53,11 @@ class XPUQuantizeOpPass : public FusePassBase { void GetQuantInfo(Graph* graph) const; - bool AreScalesPresentForNodes(std::initializer_list nodes) const; + // bool AreScalesPresentForNodes(std::initializer_list nodes) const; - float GetScaleValueForNode(Node* node) const; + // float GetScaleValueForNode(Node* node) const; - std::unordered_map> var_quant_scales_; + mutable std::unordered_map> var_quant_scales_; const std::string name_scope_{"xpu_quantize_op_pass"}; }; diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index ef352712102c4..c1eeb4c1036bd 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/constant_folding_pass.h" -#include "paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/identity_op_clean_pass.h" @@ -92,9 +91,6 @@ void ConvertToMixedPrecisionPass::Run() { LoadModel(); if (backend_ == phi::Backend::XPU) { - framework::ir::DeleteQuantDequantLinearOpPass - delete_quant_dequant_linear_op_pass; - delete_quant_dequant_linear_op_pass.Apply(main_graph_.get()); framework::ir::DeleteWeightDequantLinearOpPass delete_weight_dequant_linear_op_pass; delete_weight_dequant_linear_op_pass.Apply(main_graph_.get()); From fdb14aacc0f6666480002c84c0f562cb680b3187 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Fri, 20 Oct 2023 14:05:51 +0800 Subject: [PATCH 05/15] remove op_weights_precision attr --- .../delete_weight_dequant_linear_op_pass.cc | 11 +-- .../ir/quantize_related_pass_utils.h | 68 +++---------------- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 18 +++-- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 27 +++++--- .../framework/ir/xpu/link_xpu_op_max_pass.cc | 63 ++++++++++------- .../framework/ir/xpu/link_xpu_op_max_pass.h | 1 + .../ir/xpu/reshape2_matmul_xpu_fuse_pass.cc | 7 -- 7 files changed, 84 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 968120068b92a..52b4a8fce8c12 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" #include "glog/logging.h" @@ -40,6 +41,7 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { bool is_int8 = false; std::unordered_set nodes2rm; + std::unordered_map> var_quant_scales{}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { @@ -150,10 +152,9 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_nums)); calcu_op_desc->SetAttr("weight_scale", weight_scale); } - if (bit_length == 8) { - // Current 8-bit quantization only supports int8 - calcu_op_desc->SetAttr("op_weights_precision", - std::string("int8")); + if (!var_quant_scales.count(weight_var_node->Var()->Name())) { + var_quant_scales.insert(std::make_pair( + weight_var_node->Var()->Name(), weight_scale)); } calcu_op_desc->RenameInput( @@ -180,6 +181,8 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { } GraphSafeRemoveNodes(graph, nodes2rm); + SaveQuantInfoInTheGraph( + graph, "has_quant_info", "var_quant_scales", var_quant_scales); graph->Set("enable_int8", new bool(is_int8)); } } // namespace ir diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h index d6c54cef47d90..f8c6358dfcdb1 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -23,24 +23,6 @@ namespace paddle { namespace framework { namespace ir { -static inline void SaveInfoInTheTmpOp( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - const std::unordered_map>& info_map) { - VLOG(3) << "save variables in the first op's attr"; - - const std::string suffix = "_" + key_suffix + "_" + flag; - OpDesc op_desc; - op_desc.SetType("save"); - auto* op_node = graph->CreateOpNode(&op_desc); - - op_node->Op()->SetAttr(flag, true); - for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { - op_node->Op()->SetAttr(iter->first + suffix, iter->second); - } -} - static inline void SaveQuantInfoInTheGraph( ir::Graph* graph, const std::string& flag, @@ -48,53 +30,15 @@ static inline void SaveQuantInfoInTheGraph( const std::unordered_map>& info_map) { VLOG(1) << "Save quant info in the graph!"; const std::string suffix = "_" + key_suffix + "_" + flag; - graph->Set(flag, new bool(true)); + if (!graph->Has(flag)) { + graph->Set(flag, new bool(true)); + } for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { VLOG(1) << "SaveQuantInfoInTheGraph set attr: " << iter->first + suffix; graph->Set(iter->first + suffix, new std::vector(iter->second)); } } -static void GetInfoFromTheTmpOp( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - std::unordered_map>* info_map) { - VLOG(3) << "get variables from the first op's attr"; - - const std::string suffix = "_" + key_suffix + "_" + flag; - for (auto* op_node : - ir::TopologyVarientSort(*graph, static_cast(0))) { - if (!op_node->IsOp() || op_node->Op()->Type() != "save") continue; - VLOG(1) << "Come in save op"; - auto* op_desc = op_node->Op(); - if (op_desc->GetAttrIfExists(flag)) { - VLOG(1) << "flag is true"; - op_desc->RemoveAttr(flag); - std::vector attr_names = op_desc->AttrNames(); - VLOG(1) << "attr_names size:" << attr_names.size(); - for (auto fake_name : attr_names) { - VLOG(1) << "fake_name:" << fake_name; - 
size_t pos = fake_name.find(suffix); - if (pos != std::string::npos) { - std::string name = fake_name.substr(0, pos); - VLOG(1) << "name:" << name; - auto scales_vector = - PADDLE_GET_CONST(std::vector, op_desc->GetAttr(fake_name)); - VLOG(1) << "scales_vector:" << scales_vector[0]; - info_map->insert(std::make_pair(name, scales_vector)); - VLOG(1) << "insert success:"; - op_desc->RemoveAttr(fake_name); - VLOG(1) << "remove success:"; - } - } - graph->RemoveNode(op_node); - VLOG(1) << "remove op node success:"; - break; - } - } -} - static inline void GetQuantInfoFromTheGraph( ir::Graph* graph, const std::string& flag, @@ -164,6 +108,12 @@ static inline float GetScaleValueForNode( return var_quant_scales->at(node->Name())[0]; } +static inline std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name()); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 0605f2355ce2b..b85ff63131e0b 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -371,7 +371,9 @@ class Conv2dXPUFusePass : public FusePassBase { bool with_conv_bias, bool with_bn, bool with_scale, - std::string op_weights_precision) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionInputs( ir::Graph* graph, @@ -482,7 +484,9 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( bool with_conv_bias, bool with_bn, bool with_scale, - std::string op_weights_precision) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* conv = GetNodeFromNodesMap(nodes_map, "conv", "conv"); PADDLE_ENFORCE_EQ( @@ -505,8 +509,10 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // Get Weight scale in int8 scene - std::vector weight_scale = - conv->Op()->GetAttrIfExists>("weight_scale"); + std::vector weight_scale{}; + if (AreScalesPresentForNodes(var_quant_scales, {conv_filter})) { + weight_scale = GetScaleVecValueForNode(var_quant_scales, conv_filter); + } // Create fusion_bias_node auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_conv_bias; @@ -1094,7 +1100,8 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, with_conv_bias, with_bn, with_scale, - op_weights_precision); + op_weights_precision, + &var_quant_scales); CreateFusionInputs(graph, scope, block, @@ -1199,7 +1206,6 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, // out_dtype is same to input precision conv2d_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); - conv2d_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); // Link node auto* conv2d_xpu = graph->CreateOpNode(&conv2d_xpu_op_desc); diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index cb007fd435178..4e8a6d9d99c73 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -254,7 +254,9 @@ class FcXPUFusePass : public FusePassBase { std::map* fusion_nodes_map, bool with_bias, bool with_bn, - std::string op_weights_precision) const; + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const; void CreateFusionOutputs( ir::Graph* graph, @@ -342,7 +344,9 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( 
std::map* fusion_nodes_map, bool with_bias, bool with_bn, - std::string op_weights_precision) const { + std::string op_weights_precision, + std::unordered_map>* var_quant_scales) + const { // Get Node auto* mul = GetNodeFromNodesMap(nodes_map, "mul", "mul"); PADDLE_ENFORCE_EQ( @@ -371,8 +375,10 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( transpose_w = PADDLE_GET_CONST(bool, mul->Op()->GetAttr("trans_y")); } // Get Weight scale in int8 scene - std::vector weight_scale = - mul->Op()->GetAttrIfExists>("weight_scale"); + std::vector weight_scale{}; + if (AreScalesPresentForNodes(var_quant_scales, {mul_w})) { + weight_scale = GetScaleVecValueForNode(var_quant_scales, mul_w); + } // Create fusion_bias_node auto filter_dims = filter_t->dims(); bool has_bias = with_bn || with_bias; @@ -784,10 +790,13 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, {"out_max_in", nullptr}, {"out", nullptr}, {"out_max", nullptr}}; + auto filter_data_type = + scope->FindVar(mul_w->Name())->GetMutable()->dtype(); std::string op_weights_precision = "float32"; - if (mul->Op()->HasAttr("op_weights_precision")) { - op_weights_precision = - mul->Op()->GetAttrIfExists("op_weights_precision"); + if (filter_data_type == phi::DataType::INT8) { + op_weights_precision = "int8"; + } else if (filter_data_type == phi::DataType::FLOAT16) { + op_weights_precision = "float16"; } VLOG(4) << "FC fusion fuse pass is running on " << op_weights_precision << " precision!"; @@ -800,7 +809,8 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, &fusion_nodes_map, with_bias, with_bn, - op_weights_precision); + op_weights_precision, + &var_quant_scales); CreateFusionInputs(graph, scope, block, @@ -865,7 +875,6 @@ int FcXPUFusePass::ApplyImpl(ir::Graph* graph, "act_alpha", PADDLE_GET_CONST(float, act->Op()->GetAttr("slope"))); } } - fc_xpu_op_desc.SetAttr("op_weights_precision", op_weights_precision); // out_dtype is same to input precision fc_xpu_op_desc.SetAttr("out_dtype", fusion_nodes_map["x"]->Var()->GetDataType()); diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index 9fa5893b81666..bf03a2598726c 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -67,6 +67,7 @@ struct LinkConv2dPattern : public PatternBase { PATTERN_DECL_NODE(fusion_op); // declare variable node's name PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(filter); PATTERN_DECL_NODE(branch); private: @@ -77,23 +78,21 @@ LinkConv2dPattern::LinkConv2dPattern(PDPattern* pattern, const std::string& name_scope, bool with_branch) : PatternBase(pattern, name_scope, name_scope), with_branch_(with_branch) { - auto* fusion_op = pattern->NewNode(fusion_op_repr()) - ->assert_is_op("conv2d_xpu") - ->assert_more([&](Node* node) { - std::string op_weights_precision = - node->Op()->GetAttrIfExists( - "op_weights_precision"); - return op_weights_precision != "int8"; - }); + auto* fusion_op = + pattern->NewNode(fusion_op_repr())->assert_is_op("conv2d_xpu"); auto* x = pattern->NewNode(x_repr())->assert_is_op_input("conv2d_xpu", "x"); + auto* filter = pattern->NewNode(filter_repr()) + ->assert_is_op_input("conv2d_xpu", "filter") + ->assert_is_persistable_var(); PDNode* branch = nullptr; if (with_branch_) { branch = pattern->NewNode(branch_repr()) ->assert_is_op_input("conv2d_xpu", "branch"); - fusion_op->LinksFrom({branch}); + fusion_op->LinksFrom({x, branch, filter}); + } else { + fusion_op->LinksFrom({x, filter}); } - fusion_op->LinksFrom({x}); } 
struct LinkFcPattern : public PatternBase { @@ -103,25 +102,30 @@ struct LinkFcPattern : public PatternBase { PATTERN_DECL_NODE(fusion_op); // declare variable node's name PATTERN_DECL_NODE(x); + PATTERN_DECL_NODE(w); }; LinkFcPattern::LinkFcPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, name_scope) { - auto* fusion_op = pattern->NewNode(fusion_op_repr()) - ->assert_is_op("fc_xpu") - ->assert_more([&](Node* node) { - std::string op_weights_precision = - node->Op()->GetAttrIfExists( - "op_weights_precision"); - return op_weights_precision != "int8"; - }); - auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); + auto* fusion_op = pattern->NewNode(fusion_op_repr())->assert_is_op("fc_xpu"); - fusion_op->LinksFrom({x}); + auto* x = pattern->NewNode(x_repr())->assert_is_op_input("fc_xpu", "x"); + auto* w = pattern->NewNode(w_repr()) + ->assert_is_op_input("fc_xpu", "w") + ->assert_is_persistable_var(); + fusion_op->LinksFrom({x, w}); } } // namespace patterns +bool LinkXPUOpMaxPass::IsQuant(Node* weight_node) const { + auto w_dtype = param_scope() + ->FindVar(weight_node->Name()) + ->GetMutable() + ->dtype(); + return w_dtype == phi::DataType::INT8; +} + void LinkXPUOpMaxPass::LinkAddActMax(ir::Graph* graph) const { GraphPatternDetector gpd; patterns::LinkAddActPattern pattern(gpd.mutable_pattern(), name_scope_); @@ -168,16 +172,20 @@ void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const { GraphPatternDetector gpd; patterns::LinkConv2dPattern pattern( gpd.mutable_pattern(), name_scope_, with_branch); + auto* scope = param_scope(); int found_subgraph_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle LinkConv2dMax"; - /* declare operator node's name */ + /* get operator node's name */ GET_IR_NODE(fusion_op); - /* declare variable node's name*/ + /* get variable node's name*/ GET_IR_NODE(x); + GET_IR_NODE(filter); GET_IR_NODE(branch); + if (IsQuant(filter)) { + return; + } auto* fusion_op_desc = fusion_op->Op(); bool fusion_op_has_branch = fusion_op_desc->HasInput("branch"); if (fusion_op_has_branch) { @@ -224,14 +232,17 @@ void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const { GraphPatternDetector gpd; patterns::LinkFcPattern pattern(gpd.mutable_pattern(), name_scope_); int found_subgraph_count = 0; - + auto* scope = param_scope(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle LinkFcMax"; - /* declare operator node's name */ + /* get operator node's name */ GET_IR_NODE(fusion_op); - /* declare variable node's name*/ + /* get variable node's name*/ GET_IR_NODE(x); + GET_IR_NODE(w); + + if (IsQuant(w)) return; auto* fusion_op_desc = fusion_op->Op(); auto* x_pre_op = x->inputs[0]->Op(); if (x->inputs.size() > 0 && x->inputs[0]->IsOp() && diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h index cad199ce573bb..a71a2e19cf430 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.h @@ -102,6 +102,7 @@ Fused subgraph: */ void LinkAddActMax(ir::Graph* graph) const; + bool IsQuant(Node* weight_node) const; const std::string name_scope_{"link_xpu_op_max_pass"}; }; diff --git a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc index a06319250a9cb..8fa4a377175a7 100644 --- 
a/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/reshape2_matmul_xpu_fuse_pass.cc @@ -286,13 +286,6 @@ void MapMatmulV2ToMatmulXPUPass::MapMatmulV2ToMatmul(ir::Graph* graph) const { desc.SetAttr("transpose_X", matmul_v2->Op()->GetAttr("trans_x")); desc.SetAttr("transpose_Y", matmul_v2->Op()->GetAttr("trans_y")); desc.SetAttr("alpha", 1.0f); - if (matmul_v2->Op()->HasAttr("op_weights_precision")) { - desc.SetAttr("op_weights_precision", - matmul_v2->Op()->GetAttr("op_weights_precision")); - } - if (matmul_v2->Op()->HasAttr("weight_scale")) { - desc.SetAttr("weight_scale", matmul_v2->Op()->GetAttr("weight_scale")); - } auto matmul_node = graph->CreateOpNode(&desc); IR_NODE_LINK_TO(matmul_x, matmul_node); IR_NODE_LINK_TO(matmul_y, matmul_node); From c30a50c06635305f4a951b48308ed468fe538c8b Mon Sep 17 00:00:00 2001 From: csy0225 Date: Fri, 20 Oct 2023 18:13:43 +0800 Subject: [PATCH 06/15] support fp16 quantize model --- paddle/fluid/framework/ir/CMakeLists.txt | 2 - .../framework/ir/constant_folding_pass.cc | 3 +- .../delete_weight_dequant_linear_op_pass.cc | 2 - paddle/fluid/framework/ir/graph.h | 7 - .../auto_trans_quantize_op_precision_pass.cc | 128 -------- .../xpu/cast_mixed_precision_op_fuse_pass.cc | 44 ++- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 10 +- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 65 ++-- .../framework/ir/xpu/xpu_quantize_op_pass.h | 4 - .../ir/xpu/xpu_quantize_squash_pass.cc | 25 -- .../passes/convert_to_mixed_precision.cc | 7 - .../kernels/fusion/xpu/conv2d_xpu_kernel.cc | 285 +----------------- .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 2 +- 13 files changed, 90 insertions(+), 494 deletions(-) delete mode 100644 paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 42e9a1267e0ee..e41e7d23cd594 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -253,8 +253,6 @@ if(WITH_XPU) pass_library(conv2d_bias_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(xpu_quantize_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(xpu_quantize_squash_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) - pass_library(auto_trans_quantize_op_precision_pass inference DIR xpu DEPS - ${XPU_PASS_DEPS}) pass_library(redundant_unsqueeze_squeeze_elimination_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(redundant_squeeze_unsqueeze_elimination_pass inference DIR xpu diff --git a/paddle/fluid/framework/ir/constant_folding_pass.cc b/paddle/fluid/framework/ir/constant_folding_pass.cc index 3b3f23933fb6d..ffd6783616052 100644 --- a/paddle/fluid/framework/ir/constant_folding_pass.cc +++ b/paddle/fluid/framework/ir/constant_folding_pass.cc @@ -64,7 +64,8 @@ void ConstantFoldingPass::ApplyImpl(ir::Graph *graph) const { platform::errors::Fatal( "scope must not be null when applying constant folding.")); - std::vector blacklist{"feed", "matrix_multiply", "save"}; + std::vector blacklist{ + "feed", "matrix_multiply", "save", "dequantize_linear"}; int folded_op_num = 0; auto op_node_sorted = framework::ir::TopologyVarientSort( diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 52b4a8fce8c12..59f25483c110b 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ 
b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -117,8 +117,6 @@ void DeleteWeightDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { weight_scale_tensor->dtype())); } - int bit_length = - PADDLE_GET_CONST(int, op->GetAttr("bit_length")); int quant_axis = PADDLE_GET_CONST(int, op->GetAttr("quant_axis")); if (quant_axis == -1) { // per_layer quant_dequant: all OP diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index e29e5a2a9a9d2..e42334aac0593 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -177,11 +177,7 @@ class Graph { platform::errors::AlreadyExists( "The attribute %s to be set already exists in the graph.", attr_name)); - VLOG(1) << "set attribute " << attr_name; attrs_[attr_name] = attr; - VLOG(1) << "attrs_ size " << attrs_.size(); - std::vector attr_names = AttrNames(); - VLOG(1) << "attr_names size " << attr_names.size(); attr_dels_[attr_name] = [attr, attr_name]() { VLOG(3) << "deleting " << attr_name; delete attr; @@ -417,7 +413,6 @@ class Graph { } std::vector AttrNames() const { - VLOG(1) << "graph addr:" << this; if (FLAGS_convert_all_blocks) { if (IsMainGraph()) { return GetSubGraph(0)->AttrNames(); @@ -425,10 +420,8 @@ class Graph { } std::vector res; res.reserve(attrs_.size()); - VLOG(1) << "AttrNames attr size: " << attrs_.size(); for (auto &attr : attrs_) { res.push_back(attr.first); - VLOG(1) << "AttrNames: " << attr.first; } return res; } diff --git a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc b/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc deleted file mode 100644 index 9fec1091bd9a9..0000000000000 --- a/paddle/fluid/framework/ir/xpu/auto_trans_quantize_op_precision_pass.cc +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include - -#include "glog/logging.h" - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/xpu/pass_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/enforce.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class AutoTransQuantizeOpPrecisionPass : public FusePassBase { - protected: - void ApplyImpl(ir::Graph* graph) const override; - void FirstRound(ir::Graph* graph) const; - - const std::string name_scope_{"auto_trans_quantize_op_precision_pass"}; - const std::unordered_set support_fusion_quant_op_type_{ - "conv2d_xpu", "fc_xpu"}; -}; - -static inline Node* GetOpOutVarNodeByArgsName(ir::Graph* graph, - Node* op_node, - const std::string& arg_name) { - CHECK_EQ(op_node->IsOp(), true); - auto* op_desc = op_node->Op(); - auto out_var_nodes = op_desc->Output(arg_name); - CHECK_EQ(out_var_nodes.size(), 1UL); - auto out_var_name = out_var_nodes[0]; - auto out_var_node = FindNodeWithName(graph, out_var_name); - return out_var_node; -} - -void AutoTransQuantizeOpPrecisionPass::FirstRound(ir::Graph* graph) const { - auto graph_size = graph->SubGraphsSize(); - VLOG(1) << "There is " << graph_size << " subgraphs need to be handle."; - for (size_t i = 0; i < graph_size; i++) { - auto subgraph = graph->GetSubGraph(i); - VLOG(1) << "Handling the subgraph id: " << i; - for (auto* op_node : TopologySortOperations(*subgraph)) { - auto op_type = op_node->Op()->Type(); - if (support_fusion_quant_op_type_.find(op_type) != - support_fusion_quant_op_type_.end()) { - bool enable_int8 = op_node->Op()->GetAttrIfExists("enable_int8"); - int out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); - if (enable_int8) { - auto* out_var_node = - GetOpOutVarNodeByArgsName(subgraph, op_node, "out"); - PADDLE_ENFORCE_NOT_NULL( - out_var_node, - platform::errors::InvalidArgument( - "out_var_node in graph cannot be nullptr.")); - bool is_int8_out = true; - for (auto* next_op_node : out_var_node->outputs) { - auto next_op_type = next_op_node->Op()->Type(); - bool is_next_op_support_int8 = - next_op_node->Op()->GetAttrIfExists("enable_int8") && - ((support_fusion_quant_op_type_.find(next_op_type) != - support_fusion_quant_op_type_.end())); - if (!is_next_op_support_int8) { - is_int8_out = false; - break; - } - } - if (is_int8_out) { - op_node->Op()->SetAttr( - "out_dtype", - static_cast(proto::VarType::Type::VarType_Type_INT8)); - out_var_node->Var()->SetDataType( - proto::VarType::Type::VarType_Type_INT8); - VLOG(1) << "The out var node " << out_var_node->Name() - << " is INT8"; - } - } - } - } - } -} - -void AutoTransQuantizeOpPrecisionPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::PreconditionNotMet("graph should not be null.")); - Init(name_scope_, graph); - VLOG(1) << "AutoTransQuantizeOpPrecisionPass handling start ..."; - FirstRound(graph); - VLOG(1) << "AutoTransQuantizeOpPrecisionPass handleing end."; -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(auto_trans_quantize_op_precision_pass, - paddle::framework::ir::AutoTransQuantizeOpPrecisionPass); - -REGISTER_PASS_CAPABILITY(auto_trans_quantize_op_precision_pass) - .AddCombination( - 
paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("fc_xpu", 0) - .EQ("conv2d_xpu", 0)); diff --git a/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc index ef8759153b0cc..1a56e4d660431 100644 --- a/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/cast_mixed_precision_op_fuse_pass.cc @@ -127,6 +127,7 @@ int CastMixedPrecisionOpFusePass::ApplyCastBeforePass( GraphPatternDetector gpd; patterns::CastBeforePattern pattern( gpd.mutable_pattern(), name_scope_, mixed_precision_op_type); + auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -136,7 +137,22 @@ int CastMixedPrecisionOpFusePass::ApplyCastBeforePass( GET_IR_NODE(cast); GET_IR_NODE(cast_out); GET_IR_NODE(mixed_precision_op); - + // Note: conv2d_xpu/fc_xpu not support float32/int8/float16, can not fuse. + if (mixed_precision_op_type == "conv2d_xpu") { + auto filter_name = mixed_precision_op->Op()->Input("filter")[0]; + auto filter_data_type = + scope->FindVar(filter_name)->GetMutable()->dtype(); + if (filter_data_type == phi::DataType::INT8) { + return; + } + } else if (mixed_precision_op_type == "fc_xpu") { + auto w_name = mixed_precision_op->Op()->Input("w")[0]; + auto w_data_type = + scope->FindVar(w_name)->GetMutable()->dtype(); + if (w_data_type == phi::DataType::INT8) { + return; + } + } mixed_precision_op->Op()->RenameInput(cast_out->Name(), cast_in->Name()); IR_NODE_LINK_TO(cast_in, mixed_precision_op); @@ -155,6 +171,7 @@ int CastMixedPrecisionOpFusePass::ApplyCastAfterPass( GraphPatternDetector gpd; patterns::CastAfterPattern pattern( gpd.mutable_pattern(), name_scope_, mixed_precision_op_type); + auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, @@ -164,7 +181,30 @@ int CastMixedPrecisionOpFusePass::ApplyCastAfterPass( GET_IR_NODE(cast_in); GET_IR_NODE(cast); GET_IR_NODE(cast_out); - + // Note: conv2d_xpu/fc_xpu not support float16/int8/float32, can not fuse. 
+ if (mixed_precision_op_type == "conv2d_xpu") { + auto filter_name = mixed_precision_op->Op()->Input("filter")[0]; + auto filter_data_type = + scope->FindVar(filter_name)->GetMutable()->dtype(); + auto x_name = mixed_precision_op->Op()->Input("x")[0]; + auto* x_node = FindNodeWithName(graph, x_name); + if (filter_data_type == phi::DataType::INT8 && + x_node->Var()->GetDataType() == + proto::VarType::Type::VarType_Type_FP16) { + return; + } + } else if (mixed_precision_op_type == "fc_xpu") { + auto w_name = mixed_precision_op->Op()->Input("w")[0]; + auto w_data_type = + scope->FindVar(w_name)->GetMutable()->dtype(); + auto x_name = mixed_precision_op->Op()->Input("x")[0]; + auto* x_node = FindNodeWithName(graph, x_name); + if (w_data_type == phi::DataType::INT8 && + x_node->Var()->GetDataType() == + proto::VarType::Type::VarType_Type_FP16) { + return; + } + } mixed_precision_op->Op()->RenameOutput(cast_in->Name(), cast_out->Name()); int out_dtype = proto::VarType::Type::VarType_Type_FP32; mixed_precision_op->Op()->SetAttr("out_dtype", out_dtype); diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index b85ff63131e0b..2ce255e81707a 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -1084,10 +1084,14 @@ int Conv2dXPUFusePass::ApplyImpl(ir::Graph* graph, {"out", nullptr}, {"out_max", nullptr}}; + auto filter_data_type = scope->FindVar(conv_filter->Name()) + ->GetMutable() + ->dtype(); std::string op_weights_precision = "float32"; - if (conv->Op()->HasAttr("op_weights_precision")) { - op_weights_precision = - conv->Op()->GetAttrIfExists("op_weights_precision"); + if (filter_data_type == phi::DataType::INT8) { + op_weights_precision = "int8"; + } else if (filter_data_type == phi::DataType::FLOAT16) { + op_weights_precision = "float16"; } VLOG(4) << "Conv2d fusion fuse pass is running on " << op_weights_precision << " precision!"; diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index 0c2ab8e67f0d0..49852ec26311d 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -39,28 +39,12 @@ static void MarkAndLogCannotQuantizeOp(Node* op, << " (type: " << op->Op()->Type() << ", id: " << op->id() << ")."; if (details) msg_ss << " " << details; VLOG(2) << msg_ss.str().c_str(); - op->Op()->SetAttr("xpu_op_calc_data_type", std::string("float32")); } void XPUQuantizeOpPass::GetQuantInfo(Graph* graph) const { var_quant_scales_ = GetQuantInfoFromTheGraph(graph, "has_quant_info", "var_quant_scales"); } -// bool XPUQuantizeOpPass::AreScalesPresentForNodes( -// std::initializer_list nodes) const { -// bool present = true; -// for (auto node : nodes) { -// if (var_quant_scales_.count(node->Name()) == 0) { -// present = false; -// } -// } -// return present; -// } - -// float XPUQuantizeOpPass::GetScaleValueForNode(Node* node) const { -// return var_quant_scales_.at(node->Name())[0]; -// } - void XPUQuantizeOpPass::QuantizeInput(Graph* g, Node* op, Node* input, @@ -232,27 +216,37 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { } QuantizeInput(graph, n, x_var_node, "x"); - // Branch input - if (branch_var_node != nullptr) { - if (AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node})) { + auto has_output_scale = + AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); + bool has_branch = 
branch_var_node != nullptr; + + // Note: Conv2d fusion requres branch datatype is same as output datatype, + // so we should consider branch/output together. + if (has_branch) { + bool has_branch_scale = + AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node}); + if (has_output_scale && has_branch_scale) { QuantizeInput(graph, n, branch_var_node, "branch"); + DequantizeOutput(graph, n, out_var_node, "out"); + // Note: out_dtype attr must be set, because if dequantize_output, we + // consider the kernel out_dtype as int8. + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); } else { - n->Op()->SetAttr("xpu_op_force_output_precision", - branch_var_node->Var()->GetDataType()); + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); } - } - - auto has_output_scale = - AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); - if (has_output_scale) { - DequantizeOutput(graph, n, out_var_node, "out"); - n->Op()->SetAttr( - "out_dtype", - static_cast(proto::VarType::Type::VarType_Type_INT8)); } else { - n->Op()->SetAttr("xpu_op_force_output_precision", - x_var_node->Var()->GetDataType()); - n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + if (has_output_scale) { + DequantizeOutput(graph, n, out_var_node, "out"); + // Note: out_dtype attr must be set, because if dequantize_output, we + // consider the kernel out_dtype as int8. + n->Op()->SetAttr( + "out_dtype", + static_cast(proto::VarType::Type::VarType_Type_INT8)); + } else { + n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); + } } } } @@ -303,8 +297,6 @@ void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const { "out_dtype", static_cast(proto::VarType::Type::VarType_Type_INT8)); } else { - n->Op()->SetAttr("xpu_op_force_output_precision", - x_var_node->Var()->GetDataType()); n->Op()->SetAttr("out_dtype", x_var_node->Var()->GetDataType()); } } @@ -321,11 +313,8 @@ void XPUQuantizeOpPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument("Scope cannot be nullptr.")); GetQuantInfo(graph); - VLOG(1) << "Get quant info from graph success."; QuantizeConv(graph); - VLOG(1) << "Quantize conv of the graph success."; QuantizeFC(graph); - VLOG(1) << "Quantize fc of the graph success."; } } // namespace ir diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h index 1deb4bebe0dc7..28d0f42e76bde 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.h @@ -53,10 +53,6 @@ class XPUQuantizeOpPass : public FusePassBase { void GetQuantInfo(Graph* graph) const; - // bool AreScalesPresentForNodes(std::initializer_list nodes) const; - - // float GetScaleValueForNode(Node* node) const; - mutable std::unordered_map> var_quant_scales_; const std::string name_scope_{"xpu_quantize_op_pass"}; }; diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 8571dee220d3b..3f25be65b3c70 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -104,7 +104,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( dequant_in->Name()); next_op_desc->SetInput(input_name, input_names); } - if (keep_dequant) GraphSafeRemoveNodes(graph, {quant_op, quant_out}); else @@ -114,30 +113,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( IR_NODE_LINK_TO(dequant_in, next_op); 
found_dequant_quant_count++; - } else { - // squash dequantize-quantize to requantize op - // OpDesc desc; - // desc.SetType("requantize"); - // desc.SetInput("Input", - // std::vector({dequant_in->Name()})); - // desc.SetOutput("Output", - // std::vector({quant_out->Name()})); - // desc.SetAttr("Scale_in", dequant_scale); - // desc.SetAttr("Shift_in", dequant_shift); - // desc.SetAttr("Scale_out", quant_scale); - // desc.SetAttr("Shift_out", quant_shift); - - // auto requant_op = g->CreateOpNode(&desc); - - // if (keep_dequant) - // GraphSafeRemoveNodes(graph, {quant_op}); - // else - // GraphSafeRemoveNodes(graph, {dequant_op, quant_op, dequant_out}); - - // IR_NODE_LINK_TO(dequant_in, requant_op); - // IR_NODE_LINK_TO(requant_op, quant_out); - - // found_dequant_quant_count++; } }; gpd(graph, handler); diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index c1eeb4c1036bd..3aeeff498a52f 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -17,7 +17,6 @@ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/constant_folding_pass.h" -#include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/identity_op_clean_pass.h" #include "paddle/fluid/inference/io.h" @@ -90,12 +89,6 @@ void ConvertToMixedPrecisionPass::LoadModel() { void ConvertToMixedPrecisionPass::Run() { LoadModel(); - if (backend_ == phi::Backend::XPU) { - framework::ir::DeleteWeightDequantLinearOpPass - delete_weight_dequant_linear_op_pass; - delete_weight_dequant_linear_op_pass.Apply(main_graph_.get()); - } - framework::ir::ConstantFoldingPass constant_folding_pass; constant_folding_pass.Apply(main_graph_.get()); diff --git a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc index 9dce663de72c7..6ba3d84b5eb0b 100644 --- a/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/conv2d_xpu_kernel.cc @@ -71,12 +71,9 @@ void Conv2dXPUKernelImpl(const Context& ctx, int out_c = static_cast(filter_dims[0]); int win_h = static_cast(filter_dims[2]); int win_w = static_cast(filter_dims[3]); - VLOG(1) << "KERNEL1"; auto* input_data = reinterpret_cast(x.data()); - VLOG(1) << "KERNEL1.5"; const float* input_max_data = x_max.get_ptr() == nullptr ? 
nullptr : x_max.get_ptr()->data(); - VLOG(1) << "KERNEL2"; auto* filter_data = reinterpret_cast(filter.data()); auto* filter_max_data = filter_max.data(); auto* scale_max_data = scale_max.get_ptr() == nullptr @@ -94,134 +91,18 @@ void Conv2dXPUKernelImpl(const Context& ctx, branch_data = reinterpret_cast(branch_tensor->data()); } else { - if (branch_tensor->dtype() == phi::DataType::FLOAT32 && - out->dtype() == phi::DataType::INT8) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT32 && " - "out->dtype() == phi::DataType::INT8"; - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::quantization( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - branch_data = reinterpret_cast(branch_data_temp); - } else if (branch_tensor->dtype() == phi::DataType::FLOAT16 && - out->dtype() == phi::DataType::INT8) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::FLOAT16 && " - "out->dtype() == phi::DataType::INT8"; - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::quantization( - ctx.x_context(), - reinterpret_cast( - branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - branch_data = reinterpret_cast(branch_data_temp); - } else if (branch_tensor->dtype() == phi::DataType::INT8 && - out->dtype() == phi::DataType::FLOAT32) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " - "out->dtype() == phi::DataType::FLOAT32"; - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch.get_ptr()->dtype(), - // branch.get_ptr()->numel() * sizeof(int8_t)); - // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_before[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::dequantization( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // phi::DataType::FLOAT32, - // branch.get_ptr()->numel() * sizeof(float)); - // memory_utils::Copy(CPUPlace(), - // static_cast(temp_tensor_cpu.data()), - // ctx.GetPlace(), - // static_cast(branch_data_temp), - // branch.get_ptr()->numel() * sizeof(float)); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_after[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - branch_data = reinterpret_cast(branch_data_temp); - } else if (branch_tensor->dtype() == phi::DataType::INT8 && - out->dtype() == phi::DataType::FLOAT16) { - VLOG(1) << "branch_tensor->dtype() == phi::DataType::INT8 && " - "out->dtype() == phi::DataType::FLOAT16"; - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch.get_ptr()->dtype(), - // branch.get_ptr()->numel() * sizeof(int8_t)); - // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_before[" << i - // << "]:" << - // 
static_cast(temp_tensor_cpu.data()[i]); - // } - // } - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::dequantization( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel(), - branch_max_data); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "quantization"); - // if (branch_tensor) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // phi::DataType::FLOAT16, - // branch.get_ptr()->numel() * - // sizeof(dtype::float16)); - // memory_utils::Copy(CPUPlace(), - // static_cast(temp_tensor_cpu.data()), - // ctx.GetPlace(), - // static_cast(branch_data_temp), - // branch.get_ptr()->numel() * sizeof(dtype::float16)); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data_quantize_after[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - branch_data = reinterpret_cast(branch_data_temp); - } else { - auto branch_data_temp = - RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); - int r = xpu::cast( - ctx.x_context(), - reinterpret_cast(branch_tensor->data()), - branch_data_temp, - branch_tensor->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); - branch_data = branch_data_temp; - } + auto branch_data_temp = + RAII_GUARD.alloc_l3_or_gm(branch_tensor->numel()); + int r = xpu::cast( + ctx.x_context(), + reinterpret_cast(branch_tensor->data()), + branch_data_temp, + branch_tensor->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast"); + branch_data = branch_data_temp; } } - VLOG(1) << "KERNEL3"; + const float* bias_data = bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); auto* out_data = @@ -230,133 +111,13 @@ void Conv2dXPUKernelImpl(const Context& ctx, out_max_data = out_max_in.get_ptr() != nullptr ? 
const_cast(out_max_in.get_ptr()->data()) : out_max_data; - VLOG(1) << "KERNEL4.5"; xpu::Activation_t act(static_cast(act_type)); - VLOG(1) << "KERNEL5"; if (act_type == xpu::Activation_t::LEAKY_RELU) { act.leaky_alpha = act_param; } else if (act_type == xpu::Activation_t::HARD_SIGMOID) { act.hard_sigmoid_slope = act_param; } - // if (input_max_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // x_max.get_ptr()->dtype(), - // x_max.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *x_max.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { - // VLOG(1) << "input_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (filter_max_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // filter_max.dtype(), - // filter_max.numel() * sizeof(float)); - // phi::Copy(ctx, filter_max, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < temp_tensor_cpu.numel(); ++i) { - // VLOG(1) << "filter_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (input_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, x.dtype(), x.numel() * sizeof(T_X)); - // phi::Copy(ctx, x, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "input_data[" << i - // << "]:" << static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (filter_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); - // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "filter_data[" << i - // << "]:" << static_cast(temp_tensor_cpu.data()[i]); - // } - // } - // if (bias_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // bias.get_ptr()->dtype(), - // bias.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *bias.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "bias_data[" << i << "]:" << - // temp_tensor_cpu.data()[i]; - // } - // } - - // if (branch_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch.get_ptr()->dtype(), - // branch.get_ptr()->numel() * sizeof(T_OUT)); - // phi::Copy(ctx, *branch.get_ptr(), CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_data[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (branch_max) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // branch_max.get_ptr()->dtype(), - // branch_max.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *branch_max.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "branch_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (scale_max) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc(&temp_tensor_cpu, - // scale_max.get_ptr()->dtype(), - // scale_max.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *scale_max.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "scale_max_data[" << i - // << "]:" << temp_tensor_cpu.data()[i]; - // } - // } - - // if (filter_data) { - // DenseTensor temp_tensor_cpu; - // 
ctx.template HostAlloc( - // &temp_tensor_cpu, filter.dtype(), filter.numel() * sizeof(T_W)); - // phi::Copy(ctx, filter, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "filter_data[" << i - // << "]:" << static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (out_max_in.get_ptr()) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, out_max_in.get_ptr()->dtype(), - // out_max_in.get_ptr()->numel() * sizeof(float)); - // phi::Copy(ctx, *out_max_in.get_ptr(), CPUPlace(), false, - // &temp_tensor_cpu); for (size_t i = 0; i < out_max_in.get_ptr()->numel(); - // ++i) { - // VLOG(1) << "output_max_data_before[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } int r = xpu:: conv2d_fusion( // TX/TW/TY/TGEMM /* baidu::xpu::api::Context* ctx */ ctx.x_context(), @@ -383,30 +144,6 @@ void Conv2dXPUKernelImpl(const Context& ctx, /* const float* branch_maxptr */ branch_max_data, /* const float* scale */ scale_max_data); PADDLE_ENFORCE_XDNN_SUCCESS(r, "conv2d_xpu"); - // if (out_data) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, out->dtype(), out->numel() * sizeof(T_OUT)); - // phi::Copy(ctx, *out, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "output_data[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } - - // if (out_max) { - // DenseTensor temp_tensor_cpu; - // ctx.template HostAlloc( - // &temp_tensor_cpu, out_max->dtype(), out_max->numel() * - // sizeof(float)); - // phi::Copy(ctx, *out_max, CPUPlace(), false, &temp_tensor_cpu); - // for (size_t i = 0; i < 50; ++i) { - // VLOG(1) << "output_max_data_after[" << i - // << "]:" << - // static_cast(temp_tensor_cpu.data()[i]); - // } - // } } #define CONV2D_XPU_KERNEL_IMPL(x_dtype_, w_dtype_, out_dtype_, gemm_dtype_) \ @@ -453,7 +190,7 @@ void Conv2dXPUKernel(const Context& ctx, DenseTensor* out, DenseTensor* out_max) { // Dont use template T param - VLOG(1) << "Kernel type: " << x.dtype() << "," << filter.dtype() << " ," + VLOG(4) << "Conv kernel type: " << x.dtype() << " ," << filter.dtype() << " ," << out_dtype; if (x.dtype() == DataType::FLOAT32) { // float32/float16 kernel diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index eeb36a86eeec7..d6153eff096cb 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -133,7 +133,7 @@ void FcXPUKernel(const Context& ctx, DenseTensor* out, DenseTensor* out_max) { // Dont use template T param - VLOG(1) << "Kernel type: " << x.dtype() << " ," << w.dtype() << " ," + VLOG(4) << "Fc kernel type: " << x.dtype() << " ," << w.dtype() << " ," << out_dtype; if (x.dtype() == DataType::FLOAT32) { // float32/float16 kernel From 11da53f712e5c3778e4686ee06fb533be12fc64c Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 23 Oct 2023 10:31:57 +0800 Subject: [PATCH 07/15] code style update --- .../framework/ir/auto_mixed_precision_pass.cc | 2 +- .../ir/delete_quant_dequant_linear_op_pass.cc | 2 -- .../ir/quantize_related_pass_utils.h | 33 ----------------- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 8 ++--- paddle/fluid/framework/ir/xpu/quant_utils.cc | 35 ------------------- paddle/fluid/framework/ir/xpu/quant_utils.h | 2 -- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 32 ++++++++--------- .../ir/xpu/xpu_quantize_squash_pass.cc | 8 ++--- 
.../ir/xpu/xpu_quantize_squash_pass.h | 31 ++-------------- .../inference/api/paddle_pass_builder.cc | 1 - 10 files changed, 23 insertions(+), 131 deletions(-) diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index fe5ec348bf707..dc93b003383f4 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -771,7 +771,7 @@ void AutoMixedPrecisionPass::SetVarPrecision() const { ->GetMutable(); if (framework::TransToProtoVarType(tensor->type()) != real_in_var_node->Var()->GetDataType()) { - VLOG(1) << "[AutoMixedPrecisionPass] variable " + VLOG(3) << "[AutoMixedPrecisionPass] variable " << real_in_var_node->Name() << "'s proto data type " << real_in_var_node->Var()->GetDataType() << " is different from real dense tensor " diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 0a9fc07a7cb07..dad7a1c49c194 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -91,7 +91,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { GraphPatternDetector gpd; auto* scope = param_scope(); - BlockDesc* block = nullptr; PADDLE_ENFORCE_NOT_NULL( scope, platform::errors::InvalidArgument( @@ -114,7 +113,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { return; } */ - block = quantize_linear_op->Op()->Block(); std::unordered_set nodes2rm = {}; // Get input scale from tensor diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_related_pass_utils.h index f8c6358dfcdb1..86f2160d31bc4 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_related_pass_utils.h @@ -28,61 +28,28 @@ static inline void SaveQuantInfoInTheGraph( const std::string& flag, const std::string& key_suffix, const std::unordered_map>& info_map) { - VLOG(1) << "Save quant info in the graph!"; const std::string suffix = "_" + key_suffix + "_" + flag; if (!graph->Has(flag)) { graph->Set(flag, new bool(true)); } for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { - VLOG(1) << "SaveQuantInfoInTheGraph set attr: " << iter->first + suffix; graph->Set(iter->first + suffix, new std::vector(iter->second)); } } -static inline void GetQuantInfoFromTheGraph( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - std::unordered_map>* info_map) { - VLOG(1) << "Get quant info from the graph attrs!"; - const std::string suffix = "_" + key_suffix + "_" + flag; - VLOG(1) << "flag:" << (graph->Has(flag) ? 
1 : 0); - if (graph->Has(flag)) { - std::vector attr_names = graph->AttrNames(); - VLOG(1) << "attr_names size:" << attr_names.size(); - for (auto fake_name : attr_names) { - VLOG(1) << "fake_name:" << fake_name; - size_t pos = fake_name.find(suffix); - if (pos != std::string::npos) { - std::string name = fake_name.substr(0, pos); - VLOG(1) << "name:" << name; - auto scales_vector = graph->Get>(fake_name); - VLOG(1) << "scales_vector:" << scales_vector[0]; - info_map->insert(std::make_pair(name, scales_vector)); - } - } - } -} - static inline std::unordered_map> GetQuantInfoFromTheGraph(ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { std::unordered_map> info_map; - VLOG(1) << "Get quant info from the graph attrs!"; const std::string suffix = "_" + key_suffix + "_" + flag; - VLOG(1) << "flag:" << (graph->Has(flag) ? 1 : 0); if (graph->Has(flag)) { std::vector attr_names = graph->AttrNames(); - VLOG(1) << "attr_names size:" << attr_names.size(); for (auto fake_name : attr_names) { - VLOG(1) << "fake_name:" << fake_name; size_t pos = fake_name.find(suffix); if (pos != std::string::npos) { std::string name = fake_name.substr(0, pos); - VLOG(1) << "name:" << name; auto scales_vector = graph->Get>(fake_name); - VLOG(1) << "scales_vector:" << scales_vector[0]; info_map.insert(std::make_pair(name, scales_vector)); } } diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 2ce255e81707a..6fb76c5dbe457 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -780,9 +780,7 @@ void Conv2dXPUFusePass::CreateFusionInputs( float input_scale = GetScaleValueForNode(var_quant_scales, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc conv_input_max_desc(conv_input_max_name); - conv_input_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to xpu - // device + conv_input_max_desc.SetPersistable(true); conv_input_max_desc.SetShape({static_cast(max_ptr_size)}); conv_input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); conv2d_xpu_input_max = graph->CreateVarNode(&conv_input_max_desc); @@ -828,9 +826,7 @@ void Conv2dXPUFusePass::CreateFusionBranch( if (op_weights_precision == "int8" && !ew_branch_add_max) { int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); VarDesc ew_branch_add_in_max_desc(ew_branch_add_max_name); - ew_branch_add_in_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to - // xpu device + ew_branch_add_in_max_desc.SetPersistable(true); ew_branch_add_in_max_desc.SetShape({static_cast(max_ptr_size)}); ew_branch_add_in_max_desc.SetDataType( proto::VarType::Type::VarType_Type_FP32); diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index 90ca41f72958e..08c1da2148687 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -148,41 +148,6 @@ void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out) { } } -void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out) { - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - - paddle::experimental::CheckAndTrans2Contiguous(in); - - phi::DenseTensor int8_tensor; - phi::DenseTensor* out_ptr = out == nullptr ? 
&int8_tensor : out; - out_ptr->Resize(in->dims()); - out_ptr->set_type(phi::DataType::INT8); - out_ptr->set_layout(in->layout()); - - switch (in->dtype()) { - case phi::DataType::FLOAT32: - phi::CastKernel(*cpu_ctx, *in, phi::DataType::INT8, out_ptr); - break; - case phi::DataType::INT8: - if (out == nullptr) { - return; - } else { - phi::AssignKernel(*cpu_ctx, *in, out_ptr); - } - break; - default: - PADDLE_THROW(platform::errors::InvalidArgument( - "Only support fp32, but received dtype is %s.", - phi::DataTypeToString(in->dtype()))); - break; - } - - if (out == nullptr) { - Assign(*out_ptr, in); - } -} - static float FindMaxAbs(const float* data, int len) { float max_f = 0.0f; for (int i = 0; i < len; ++i) { diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h index 30f73023b632d..b564bcac7202d 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.h +++ b/paddle/fluid/framework/ir/xpu/quant_utils.h @@ -25,8 +25,6 @@ void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); -void CastToInt8(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); - void CastToInt32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); template diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index 49852ec26311d..dc151a12ee2fb 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -65,15 +65,13 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); quantize_out_node->Var()->SetDataType( proto::VarType::Type::VarType_Type_INT8); - // Create quantize max_ptr node + // Create quantize max_ptr node float scale = GetScaleValueForNode(&var_quant_scales_, input); int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = input->Name() + "_quantize_max"; VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to xpu - // device + input_max_desc.SetPersistable(true); input_max_desc.SetShape({static_cast(max_ptr_size)}); input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); Node* input_max_node = g->CreateVarNode(&input_max_desc); @@ -88,7 +86,7 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, input_scales.data(), max_ptr_size * sizeof(float)); - // create a quantize op node + // Create a quantize op node OpDesc q_desc; q_desc.SetType("quantize_xpu"); q_desc.SetInput("x", std::vector({input->Name()})); @@ -97,12 +95,13 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, q_desc.SetAttr("out_dtype", static_cast(proto::VarType::Type::VarType_Type_INT8)); q_desc.SetAttr("scale", static_cast(scale)); - auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. 
- // update op's input + + // Update op's input op->Op()->SetInput(input_arg_name, std::vector({quantize_out_node->Name()})); - // link quantize op + + // Link quantize op UnlinkNodes(input, op); IR_NODE_LINK_TO(input, quantize_op); IR_NODE_LINK_TO(input_max_node, quantize_op); @@ -137,9 +136,7 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); std::string input_max_name = output->Name() + "_dequantize_max"; VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable( - true); // Need depends on ir_params_sync_among_devices_pass copy to xpu - // device + input_max_desc.SetPersistable(true); input_max_desc.SetShape({static_cast(max_ptr_size)}); input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); Node* input_max_node = g->CreateVarNode(&input_max_desc); @@ -154,7 +151,7 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, input_scales.data(), max_ptr_size * sizeof(float)); - // create a quantize op node + // Create a quantize op node OpDesc deq_desc; deq_desc.SetType("dequantize_xpu"); deq_desc.SetInput("x", @@ -163,12 +160,13 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, deq_desc.SetOutput("y", std::vector({output->Name()})); deq_desc.SetAttr("out_dtype", static_cast(output->Var()->GetDataType())); deq_desc.SetAttr("scale", static_cast(scale)); - auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. - // update op's input + + // Update op's input op->Op()->SetOutput(output_arg_name, std::vector({dequantize_in_node->Name()})); - // link dequantize op + + // Link dequantize op UnlinkNodes(op, output); IR_NODE_LINK_TO(op, dequantize_in_node); IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); @@ -220,8 +218,8 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { AreScalesPresentForNodes(&var_quant_scales_, {out_var_node}); bool has_branch = branch_var_node != nullptr; - // Note: Conv2d fusion requres branch datatype is same as output datatype, - // so we should consider branch/output together. + // Note: Conv2d fusion requires branch datatype is same as output + // datatype, so we should consider branch/output together. if (has_branch) { bool has_branch_scale = AreScalesPresentForNodes(&var_quant_scales_, {branch_var_node}); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 3f25be65b3c70..7b1658d8d13aa 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -91,10 +91,8 @@ void XPUQuantizeSquashPass::DequantQuantSquash( bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; int equal = dequant_scale == quant_scale ? 
1 : 0; - if (dequant_scale == quant_scale || isnan(dequant_scale) || - isnan(quant_scale) || isinf(dequant_scale) || isinf(quant_scale)) { + if (dequant_scale == quant_scale) { // squash dequantize-quantize to nothing - auto quant_out_var_name = quant_out->Name(); for (auto input_name : next_op_desc->InputNames()) { auto& input_names = next_op_desc->MutableInputs()->at(input_name); @@ -131,7 +129,6 @@ void XPUQuantizeSquashPass::OpDequantSquash(Graph* graph) const { auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { VLOG(4) << "squash op-dequant ops pair"; - GET_IR_NODE_FROM_SUBGRAPH(any_op, any_op, op_dequant_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, op_dequant_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, op_dequant_pattern); @@ -275,7 +272,8 @@ void XPUQuantizeSquashPass::ApplyImpl(ir::Graph* graph) const { FindNodesToKeep(graph, &nodes_keep_counter); DequantQuantSquash(graph, &nodes_keep_counter); OpDequantSquash(graph); - // QuantOpSquash(graph); + // QuantOpSquash(graph); // If the quant op is fused into conv2d_xpu, the + // performance will become worse. MultipleQuantizeSquash(graph); } diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h index fbfa967791304..2d3fbb94f140e 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h @@ -44,34 +44,14 @@ class XPUQuantizeSquashPass : public FusePassBase { std::unordered_map* nodes_keep_counter) const; /* - * Don't squash unsigned dequantize with signed quantize. - * This is important for concat and elementwise ops. - * When inputs have different sign, concat will assume signed type and - * elementwise assumes first input type. 
- */ - bool IsDequantizeQuantizeIncompatible(Node* quant_op, - Node* dequant_op, - Node* next_op) const; - - /* - * Squash dequantize-quantize ops pairs into requantize or nothing + * Squash dequantize-quantize ops pairs into nothing */ void DequantQuantSquash( Graph* graph, std::unordered_map* nodes_keep_counter) const; /* - * Squash requantize op into conv with scale_out like requantize scale_out - */ - void OpRequantSquash(Graph* graph) const; - - /* - * Squash requantize op if the next operator's input scale can be updated - */ - void RequantOpSquash(Graph* graph) const; - - /* - * Squash dequant if the previous operator has force_fp32_output attribute + * Squash dequant if the previous operator support fp32 out */ void OpDequantSquash(Graph* graph) const; @@ -90,13 +70,6 @@ class XPUQuantizeSquashPass : public FusePassBase { */ void ScaleQuantSquash(Graph* graph) const; - /* - * Squash quantize if is before bfloat16 conv2d or fused_conv2d - */ - void QuantizeBf16Conv(Graph* graph) const; - - void QuantizeBf16ConvImpl(Graph* graph, const std::string& conv_type) const; - /* * Squash quantize if is before conv2d_xpu/fc_xpuy */ diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 41d2ccd67b43a..25c2e0988c419 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -565,7 +565,6 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "cast_mixed_precision_op_fuse_pass", "xpu_quantize_op_pass", "xpu_quantize_squash_pass", - // "auto_trans_quantize_op_precision_pass", "delete_isolated_node_pass", "inplace_op_var_pass", }); From 748bb9dd3affebd0042d14e654c4603938260d9e Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 23 Oct 2023 12:40:38 +0800 Subject: [PATCH 08/15] update quantize/dequantize op yaml --- .../ir/xpu/xpu_graph_pattern_detector.cc | 16 ++----- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 44 +------------------ .../ir/xpu/xpu_quantize_squash_pass.cc | 2 - paddle/phi/api/yaml/ops.yaml | 6 +-- paddle/phi/infermeta/binary.cc | 20 --------- paddle/phi/infermeta/binary.h | 12 ----- paddle/phi/infermeta/unary.cc | 18 ++++++++ paddle/phi/infermeta/unary.h | 10 +++++ .../phi/kernels/xpu/dequantization_kernel.cc | 16 ++++--- paddle/phi/kernels/xpu/quantization_kernel.cc | 16 ++++--- 10 files changed, 52 insertions(+), 108 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc index f74f9c8289d65..f1d2752321aad 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_graph_pattern_detector.cc @@ -57,9 +57,6 @@ PDNode *patterns::DequantQuantXPUAny::operator()() { auto *dequant_in = pattern->NewNode(dequant_in_repr()) ->AsInput() ->assert_is_op_input("dequantize_xpu", "x"); - auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) - ->AsInput() - ->assert_is_op_input("dequantize_xpu", "max"); auto *dequant_op = pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); @@ -68,9 +65,6 @@ PDNode *patterns::DequantQuantXPUAny::operator()() { ->AsOutput() ->assert_is_op_output("dequantize_xpu", "y"); - auto *quant_max_in = pattern->NewNode(quant_max_in_repr()) - ->assert_is_op_input("quantize_xpu", "max"); - auto *quant_op = pattern->NewNode(quant_op_repr()) ->assert_is_op("quantize_xpu") ->AsIntermediate(); @@ -81,8 +75,8 @@ PDNode *patterns::DequantQuantXPUAny::operator()() { auto *next_op = 
pattern->NewNode(next_op_repr())->assert_is_op(); - dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); - quant_op->LinksFrom({dequant_out, quant_max_in}).LinksTo({quant_out}); + dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out}); + quant_op->LinksFrom({dequant_out}).LinksTo({quant_out}); next_op->LinksFrom({quant_out}); return quant_out; @@ -92,10 +86,6 @@ PDNode *patterns::OpDequantXPU::operator()() { auto any_op = pattern->NewNode(any_op_repr())->assert_is_op(); auto *dequant_in = pattern->NewNode(dequant_in_repr()) ->assert_is_op_input("dequantize_xpu", "x"); - - auto *dequant_max_in = pattern->NewNode(dequant_max_in_repr()) - ->AsInput() - ->assert_is_op_input("dequantize_xpu", "max"); auto *dequant_op = pattern->NewNode(dequant_op_repr())->assert_is_op("dequantize_xpu"); auto dequant_out = pattern->NewNode(dequant_out_repr()) @@ -103,7 +93,7 @@ PDNode *patterns::OpDequantXPU::operator()() { ->assert_is_op_output("dequantize_xpu", "y"); any_op->LinksTo({dequant_in}); - dequant_op->LinksFrom({dequant_in, dequant_max_in}).LinksTo({dequant_out}); + dequant_op->LinksFrom({dequant_in}).LinksTo({dequant_out}); return dequant_out; } diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index dc151a12ee2fb..ebeb75763320e 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -66,31 +66,11 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, quantize_out_node->Var()->SetDataType( proto::VarType::Type::VarType_Type_INT8); - // Create quantize max_ptr node - float scale = GetScaleValueForNode(&var_quant_scales_, input); - int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); - std::string input_max_name = input->Name() + "_quantize_max"; - VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable(true); - input_max_desc.SetShape({static_cast(max_ptr_size)}); - input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - Node* input_max_node = g->CreateVarNode(&input_max_desc); - auto input_max_tensor = - scope->Var(input_max_name)->GetMutable(); - input_max_tensor->set_type(phi::DataType::FLOAT32); - input_max_tensor->Resize({max_ptr_size}); - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - std::vector input_scales(max_ptr_size, scale); - memcpy(cpu_ctx->Alloc(input_max_tensor), - input_scales.data(), - max_ptr_size * sizeof(float)); - // Create a quantize op node + float scale = GetScaleValueForNode(&var_quant_scales_, input); OpDesc q_desc; q_desc.SetType("quantize_xpu"); q_desc.SetInput("x", std::vector({input->Name()})); - q_desc.SetInput("max", std::vector({input_max_name})); q_desc.SetOutput("y", std::vector({quantize_out_node->Name()})); q_desc.SetAttr("out_dtype", static_cast(proto::VarType::Type::VarType_Type_INT8)); @@ -104,7 +84,6 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, // Link quantize op UnlinkNodes(input, op); IR_NODE_LINK_TO(input, quantize_op); - IR_NODE_LINK_TO(input_max_node, quantize_op); IR_NODE_LINK_TO(quantize_op, quantize_out_node); IR_NODE_LINK_TO(quantize_out_node, op); } @@ -131,32 +110,12 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, dequantize_in_node->Var()->SetDataType( proto::VarType::Type::VarType_Type_INT8); - // Create dequantize max_ptr node float scale = GetScaleValueForNode(&var_quant_scales_, output); - int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); - std::string 
input_max_name = output->Name() + "_dequantize_max"; - VarDesc input_max_desc(input_max_name); - input_max_desc.SetPersistable(true); - input_max_desc.SetShape({static_cast(max_ptr_size)}); - input_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - Node* input_max_node = g->CreateVarNode(&input_max_desc); - auto input_max_tensor = - scope->Var(input_max_name)->GetMutable(); - input_max_tensor->set_type(phi::DataType::FLOAT32); - input_max_tensor->Resize({max_ptr_size}); - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - std::vector input_scales(max_ptr_size, scale); - memcpy(cpu_ctx->Alloc(input_max_tensor), - input_scales.data(), - max_ptr_size * sizeof(float)); - // Create a quantize op node OpDesc deq_desc; deq_desc.SetType("dequantize_xpu"); deq_desc.SetInput("x", std::vector({dequantize_in_node->Name()})); - deq_desc.SetInput("max", std::vector({input_max_name})); deq_desc.SetOutput("y", std::vector({output->Name()})); deq_desc.SetAttr("out_dtype", static_cast(output->Var()->GetDataType())); deq_desc.SetAttr("scale", static_cast(scale)); @@ -170,7 +129,6 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, UnlinkNodes(op, output); IR_NODE_LINK_TO(op, dequantize_in_node); IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); - IR_NODE_LINK_TO(input_max_node, dequantize_op); IR_NODE_LINK_TO(dequantize_op, output); } diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 7b1658d8d13aa..0e6fd9797c177 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -59,7 +59,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( Graph* graph, std::unordered_map* nodes_keep_counter) const { GraphPatternDetector gpd; - LOG(INFO) << "DequantQuantSquash COME IN"; patterns::DequantQuantXPUAny squash_pattern{gpd.mutable_pattern(), "dequant_quant_xpu_any"}; squash_pattern(); @@ -90,7 +89,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( // check if dequantize op should be kept or removed, decrease the counter bool keep_dequant = (*nodes_keep_counter)[dequant_out]-- > 1; - int equal = dequant_scale == quant_scale ? 
1 : 0; if (dequant_scale == quant_scale) { // squash dequantize-quantize to nothing auto quant_out_var_name = quant_out->Name(); diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index e41773a39c0dd..33b00e58ce841 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -658,14 +658,13 @@ backward : depthwise_conv2d_grad - op : dequantize_xpu - args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + args : (Tensor x, DataType out_dtype, float scale = 1.0f) output : Tensor(y) infer_meta : func : DeQuantizeXPUInferMeta kernel : func : dequantize_xpu data_type: x - optional : max - op : det args : (Tensor x) @@ -2050,14 +2049,13 @@ backward : qr_grad - op : quantize_xpu - args : (Tensor x, Tensor max, DataType out_dtype, float scale = 1.0f) + args : (Tensor x, DataType out_dtype, float scale = 1.0f) output : Tensor(y) infer_meta : func : QuantizeXPUInferMeta kernel : func : quantize_xpu data_type : x - optional : max - op : real args : (Tensor x) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index b5599c454fdcc..2aa8543eb82c3 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -978,16 +978,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input, config); } -void DeQuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y) { - auto x_dims = x.dims(); - y->set_dims(x_dims); - y->set_dtype(out_dtype); -} - void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -2607,16 +2597,6 @@ void PriorBoxInferMeta(const MetaTensor& input, var->set_dims(phi::make_ddim(dim_vec)); } -void QuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y) { - auto x_dims = x.dims(); - y->set_dims(x_dims); - y->set_dtype(out_dtype); -} - void RepeatInterleaveWithTensorIndexInferMeta(const MetaTensor& x, const MetaTensor& repeats, int dim, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 88e4356b105c4..153a8d553ceb5 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -155,12 +155,6 @@ void DepthwiseConvInferMeta(const MetaTensor& input, MetaTensor* out, MetaConfig config = MetaConfig()); -void DeQuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y); - void DistInferMeta(const MetaTensor& x, const MetaTensor& y, float p, @@ -414,12 +408,6 @@ void PriorBoxInferMeta(const MetaTensor& input, MetaTensor* out, MetaTensor* var); -void QuantizeXPUInferMeta(const MetaTensor& x, - const MetaTensor& max, - DataType out_dtype, - float scale, - MetaTensor* y); - void SearchsortedInferMeta(const MetaTensor& sorted_sequence, const MetaTensor& value, bool out_int32, diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 243f0b232395e..e6238d0ee5be0 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -672,6 +672,15 @@ void DecodeJpegInferMeta(const MetaTensor& x, } } +void DeQuantizeXPUInferMeta(const MetaTensor& x, + DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void DiagEmbedInferMeta( const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out) { auto x_dims = x.dims(); @@ -3768,6 +3777,15 @@ void FillSplitOutDims(const MetaTensor& x, } } +void QuantizeXPUInferMeta(const MetaTensor& x, + 
DataType out_dtype, + float scale, + MetaTensor* y) { + auto x_dims = x.dims(); + y->set_dims(x_dims); + y->set_dtype(out_dtype); +} + void SplitInferMeta(const MetaTensor& x, const IntArray& sections, const Scalar& axis, diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index d79b53a71097e..8a28d454e42f7 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -145,6 +145,11 @@ void DecodeJpegInferMeta(const MetaTensor& x, const std::string& mode, MetaTensor* out); +void DeQuantizeXPUInferMeta(const MetaTensor& x, + DataType out_dtype, + float scale, + MetaTensor* y); + void DiagEmbedInferMeta( const MetaTensor& x, int offset, int dim1, int dim2, MetaTensor* out); @@ -453,6 +458,11 @@ void QrInferMeta(const MetaTensor& x, MetaTensor* q, MetaTensor* r); +void QuantizeXPUInferMeta(const MetaTensor& x, + DataType out_dtype, + float scale, + MetaTensor* y); + void WeightQuantizeInferMeta(const MetaTensor& x, const std::string& algo, MetaTensor* out, diff --git a/paddle/phi/kernels/xpu/dequantization_kernel.cc b/paddle/phi/kernels/xpu/dequantization_kernel.cc index 20423c1eb8920..759a3fd020458 100644 --- a/paddle/phi/kernels/xpu/dequantization_kernel.cc +++ b/paddle/phi/kernels/xpu/dequantization_kernel.cc @@ -19,7 +19,7 @@ namespace phi { template void DeQuantizeKernelImpl(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, + float scale, DenseTensor* y) { using XPUInX = typename XPUTypeTrait::Type; using XPUOutY = typename XPUTypeTrait::Type; @@ -27,9 +27,12 @@ void DeQuantizeKernelImpl(const Context& ctx, auto* y_data = ctx.template Alloc(y); const auto* x_data = x.data(); int64_t len = x.numel(); - const float* max_data = - max.get_ptr() == nullptr ? nullptr : max->data(); - int r = xpu::dequantization( + int max_ptr_size = ctx.x_context()->max_ptr_size(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + auto max_data = RAII_GUARD.alloc_l3_or_gm(max_ptr_size); + int r = xpu::constant(ctx.x_context(), max_data, max_ptr_size, scale); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + r = xpu::dequantization( ctx.x_context(), reinterpret_cast(x_data), reinterpret_cast(y_data), @@ -41,16 +44,15 @@ void DeQuantizeKernelImpl(const Context& ctx, template void DeQuantizeKernel(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, DataType out_dtype, float scale, DenseTensor* y) { switch (out_dtype) { case DataType::FLOAT32: - DeQuantizeKernelImpl(ctx, x, max, y); + DeQuantizeKernelImpl(ctx, x, scale, y); break; case DataType::FLOAT16: - DeQuantizeKernelImpl(ctx, x, max, y); + DeQuantizeKernelImpl(ctx, x, scale, y); break; default: PADDLE_THROW(phi::errors::Unavailable( diff --git a/paddle/phi/kernels/xpu/quantization_kernel.cc b/paddle/phi/kernels/xpu/quantization_kernel.cc index 01f6ddad93aa0..32b28b034e2da 100644 --- a/paddle/phi/kernels/xpu/quantization_kernel.cc +++ b/paddle/phi/kernels/xpu/quantization_kernel.cc @@ -19,7 +19,7 @@ namespace phi { template void QuantizeKernelImpl(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, + float scale, DenseTensor* y) { using XPUInX = typename XPUTypeTrait::Type; using XPUOutY = typename XPUTypeTrait::Type; @@ -27,9 +27,12 @@ void QuantizeKernelImpl(const Context& ctx, auto* y_data = ctx.template Alloc(y); const auto* x_data = x.data(); int64_t len = x.numel(); - const float* max_data = - max.get_ptr() == nullptr ? 
nullptr : max->data(); - int r = xpu::quantization( + int max_ptr_size = ctx.x_context()->max_ptr_size(); + xpu::ctx_guard RAII_GUARD(ctx.x_context()); + auto max_data = RAII_GUARD.alloc_l3_or_gm(max_ptr_size); + int r = xpu::constant(ctx.x_context(), max_data, max_ptr_size, scale); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant"); + r = xpu::quantization( ctx.x_context(), reinterpret_cast(x_data), reinterpret_cast(y_data), @@ -41,16 +44,15 @@ void QuantizeKernelImpl(const Context& ctx, template void QuantizeKernel(const Context& ctx, const DenseTensor& x, - const paddle::optional& max, DataType out_dtype, float scale, DenseTensor* y) { switch (out_dtype) { case DataType::INT16: - QuantizeKernelImpl(ctx, x, max, y); + QuantizeKernelImpl(ctx, x, scale, y); break; case DataType::INT8: - QuantizeKernelImpl(ctx, x, max, y); + QuantizeKernelImpl(ctx, x, scale, y); break; default: PADDLE_THROW(phi::errors::Unavailable( From 5fea223f9060300a2741bc66575f3d7a95e9c40e Mon Sep 17 00:00:00 2001 From: csy0225 Date: Mon, 23 Oct 2023 14:32:00 +0800 Subject: [PATCH 09/15] fix code style --- paddle/fluid/framework/ir/CMakeLists.txt | 7 ++- .../ir/delete_quant_dequant_linear_op_pass.cc | 2 +- .../delete_weight_dequant_linear_op_pass.cc | 2 +- ...d_pass_utils.h => quantize_pass_helper.cc} | 21 +++----- .../fluid/framework/ir/quantize_pass_helper.h | 49 +++++++++++++++++++ .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 20 ++++---- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 17 +------ .../framework/ir/xpu/link_xpu_op_max_pass.cc | 2 - .../framework/ir/xpu/xpu_quantize_op_pass.cc | 2 +- .../ir/xpu/xpu_quantize_squash_pass.cc | 2 - .../ir/xpu/xpu_quantize_squash_pass.h | 10 ---- .../phi/kernels/xpu/dequantization_kernel.cc | 2 +- 12 files changed, 78 insertions(+), 58 deletions(-) rename paddle/fluid/framework/ir/{quantize_related_pass_utils.h => quantize_pass_helper.cc} (81%) create mode 100644 paddle/fluid/framework/ir/quantize_pass_helper.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bd9d40bde4702..47e7a9948856c 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,6 +59,10 @@ cc_library( placement_pass_base SRCS placement_pass_base.cc DEPS pass) +cc_library( + quantize_pass_helper + SRCS quantize_pass_helper.cc + DEPS pass graph graph_helper) cc_library( coalesce_grad_tensor_pass @@ -241,7 +245,8 @@ if(WITH_XPU) xpu_graph_pattern_detector SRCS xpu/xpu_graph_pattern_detector.cc DEPS graph_pattern_detector) - set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils xpu_graph_pattern_detector) + set(XPU_PASS_DEPS quantize_pass_helper xpu_quant_utils xpu_pass_utils + xpu_graph_pattern_detector) pass_library(cast_mixed_precision_op_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index c36fd3d4ff269..025cd0c2b7ddd 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -19,7 +19,7 @@ #include #include #include -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc 
b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index 59f25483c110b..e30ae85f71c02 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "glog/logging.h" diff --git a/paddle/fluid/framework/ir/quantize_related_pass_utils.h b/paddle/fluid/framework/ir/quantize_pass_helper.cc similarity index 81% rename from paddle/fluid/framework/ir/quantize_related_pass_utils.h rename to paddle/fluid/framework/ir/quantize_pass_helper.cc index 86f2160d31bc4..730123682f58f 100644 --- a/paddle/fluid/framework/ir/quantize_related_pass_utils.h +++ b/paddle/fluid/framework/ir/quantize_pass_helper.cc @@ -12,18 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once - -#include - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" namespace paddle { namespace framework { namespace ir { -static inline void SaveQuantInfoInTheGraph( +void SaveQuantInfoInTheGraph( ir::Graph* graph, const std::string& flag, const std::string& key_suffix, @@ -37,10 +32,8 @@ static inline void SaveQuantInfoInTheGraph( } } -static inline std::unordered_map> -GetQuantInfoFromTheGraph(ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix) { +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { std::unordered_map> info_map; const std::string suffix = "_" + key_suffix + "_" + flag; if (graph->Has(flag)) { @@ -57,7 +50,7 @@ GetQuantInfoFromTheGraph(ir::Graph* graph, return info_map; } -static inline bool AreScalesPresentForNodes( +bool AreScalesPresentForNodes( std::unordered_map>* var_quant_scales, std::initializer_list nodes) { bool present = true; @@ -69,13 +62,13 @@ static inline bool AreScalesPresentForNodes( return present; } -static inline float GetScaleValueForNode( +float GetScaleValueForNode( std::unordered_map>* var_quant_scales, Node* node) { return var_quant_scales->at(node->Name())[0]; } -static inline std::vector GetScaleVecValueForNode( +std::vector GetScaleVecValueForNode( std::unordered_map>* var_quant_scales, Node* node) { return var_quant_scales->at(node->Name()); diff --git a/paddle/fluid/framework/ir/quantize_pass_helper.h b/paddle/fluid/framework/ir/quantize_pass_helper.h new file mode 100644 index 0000000000000..4876cd35a1cf3 --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_pass_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map); + +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix); + +bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes); + +float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 6fb76c5dbe457..09037a0fd60eb 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -515,7 +515,6 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // Create fusion_bias_node auto filter_dims = filter_t->dims(); - bool has_bias = with_bn || with_conv_bias; Node* fusion_bias_node = nullptr; if (with_conv_bias) { auto* ew_bias_add_y = @@ -677,7 +676,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( filter_ptr[i] *= scale_val_; } } else { - for (int i = 0; i < weight_scale.size(); i++) { + for (size_t i = 0; i < weight_scale.size(); i++) { weight_scale[i] *= scale_val_; } } @@ -877,12 +876,12 @@ void Conv2dXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("conv node ptr can not be null")); // output && output max std::string conv2d_xpu_out_name; - Node* conv2d_out_op_node = nullptr; Node* conv2d_out_var_node = nullptr; auto* ew_branch_add = GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add"); auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); + auto* scale = GetNodeFromNodesMap(nodes_map, "scale", "scale"); auto* ew_bias_add = GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add"); if (!act_type.empty()) { @@ -898,7 +897,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( act != nullptr, true, platform::errors::InvalidArgument("act node ptr can not be null")); - conv2d_out_op_node = act; } else if (ew_branch_add) { auto* ew_branch_add_out = GetNodeFromNodesMap(nodes_map, "ew_branch_add", "ew_branch_add_out"); @@ -912,7 +910,14 @@ void Conv2dXPUFusePass::CreateFusionOutputs( true, platform::errors::InvalidArgument( "ew_branch_add node ptr can not be null")); - conv2d_out_op_node = ew_branch_add; + } else if (scale) { + auto* scale_out = GetNodeFromNodesMap(nodes_map, "scale", "scale_out"); + PADDLE_ENFORCE_EQ(scale_out != nullptr, + true, + platform::errors::InvalidArgument( + "scale_out node ptr can not be null")); + conv2d_xpu_out_name = scale_out->Name(); + 
conv2d_out_var_node = scale_out; } else if (bn) { auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); PADDLE_ENFORCE_EQ( @@ -921,7 +926,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("bn_out node ptr can not be null")); conv2d_xpu_out_name = bn_out->Name(); conv2d_out_var_node = bn_out; - conv2d_out_op_node = bn; } else if (ew_bias_add) { auto* ew_bias_add_out = GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); @@ -931,7 +935,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( "ew_bias_add_out node ptr can not be null")); conv2d_xpu_out_name = ew_bias_add_out->Name(); conv2d_out_var_node = ew_bias_add_out; - conv2d_out_op_node = ew_bias_add; } else { auto* conv_out = GetNodeFromNodesMap(nodes_map, "conv", "conv_out"); PADDLE_ENFORCE_EQ( @@ -945,7 +948,6 @@ void Conv2dXPUFusePass::CreateFusionOutputs( conv != nullptr, true, platform::errors::InvalidArgument("conv node ptr can not be null")); - conv2d_out_op_node = conv; } (*fusion_nodes_map)["out"] = conv2d_out_var_node; diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 4e8a6d9d99c73..93ad3aec0d16a 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -381,7 +381,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( } // Create fusion_bias_node auto filter_dims = filter_t->dims(); - bool has_bias = with_bn || with_bias; Node* fusion_bias_node = nullptr; if (with_bias) { auto* ew_bias_add_bias = @@ -390,8 +389,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( true, platform::errors::InvalidArgument( "ew_bias_add_bias node ptr can not be null")); - auto* ew_bias_add_bias_t = scope->FindVar(ew_bias_add_bias->Name()) - ->GetMutable(); PrepareBias(graph, scope, block, ew_bias_add_bias, &fusion_bias_node); } @@ -424,13 +421,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( auto bn_bias_t = scope->Var(bn_bias->Name())->GetMutable(); - PADDLE_ENFORCE_EQ( - filter_dims[0], - bn_bias_t->dims()[0], - platform::errors::InvalidArgument("the shape[%d] of bn bias tensor " - "must equal out_channel[%d] of conv", - bn_bias_t->dims()[0], - filter_dims[0])); auto bn_scale_t = scope->Var(bn_scale->Name())->GetMutable(); auto bn_mean_t = @@ -582,7 +572,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("mul node ptr can not be null")); // output && output max std::string fc_xpu_out_name; - Node* fc_out_op_node = nullptr; Node* fc_out_var_node = nullptr; auto* bn = GetNodeFromNodesMap(nodes_map, "bn", "bn"); @@ -597,7 +586,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("act_out node ptr can not be null")); fc_xpu_out_name = act_out->Name(); fc_out_var_node = act_out; - fc_out_op_node = act; } else if (bn) { auto* bn_out = GetNodeFromNodesMap(nodes_map, "bn", "bn_out"); PADDLE_ENFORCE_EQ( @@ -606,7 +594,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("bn_out node ptr can not be null")); fc_xpu_out_name = 
bn_out->Name(); fc_out_var_node = bn_out; - fc_out_op_node = bn; } else if (ew_bias_add) { auto* ew_bias_add_out = GetNodeFromNodesMap(nodes_map, "ew_bias_add", "ew_bias_add_out"); @@ -616,7 +603,6 @@ void FcXPUFusePass::CreateFusionOutputs( "ew_bias_add_out node ptr can not be null")); fc_xpu_out_name = ew_bias_add_out->Name(); fc_out_var_node = ew_bias_add_out; - fc_out_op_node = ew_bias_add; } else { auto* mul_out = GetNodeFromNodesMap(nodes_map, "mul", "mul_out"); PADDLE_ENFORCE_EQ( @@ -625,7 +611,6 @@ void FcXPUFusePass::CreateFusionOutputs( platform::errors::InvalidArgument("mul_out node ptr can not be null")); fc_xpu_out_name = mul_out->Name(); fc_out_var_node = mul_out; - fc_out_op_node = mul; } (*fusion_nodes_map)["out"] = fc_out_var_node; diff --git a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc index bf03a2598726c..9b552bac36f2d 100644 --- a/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc +++ b/paddle/fluid/framework/ir/xpu/link_xpu_op_max_pass.cc @@ -172,7 +172,6 @@ void LinkXPUOpMaxPass::LinkConv2dMax(ir::Graph* graph, bool with_branch) const { GraphPatternDetector gpd; patterns::LinkConv2dPattern pattern( gpd.mutable_pattern(), name_scope_, with_branch); - auto* scope = param_scope(); int found_subgraph_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { @@ -232,7 +231,6 @@ void LinkXPUOpMaxPass::LinkFcMax(ir::Graph* graph) const { GraphPatternDetector gpd; patterns::LinkFcPattern pattern(gpd.mutable_pattern(), name_scope_); int found_subgraph_count = 0; - auto* scope = param_scope(); auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { VLOG(4) << "handle LinkFcMax"; diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index ebeb75763320e..865464dcd7dca 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/framework/ir/quantize_related_pass_utils.h" +#include "paddle/fluid/framework/ir/quantize_pass_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc index 0e6fd9797c177..6161293bf7fb7 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.cc @@ -66,8 +66,6 @@ void XPUQuantizeSquashPass::DequantQuantSquash( int found_dequant_quant_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { - LOG(INFO) << "squash dequantize-quantize ops pair"; - GET_IR_NODE_FROM_SUBGRAPH(dequant_in, dequant_in, squash_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_op, dequant_op, squash_pattern); GET_IR_NODE_FROM_SUBGRAPH(dequant_out, dequant_out, squash_pattern); diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h index 2d3fbb94f140e..d3f37dd42010d 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_squash_pass.h @@ -60,16 +60,6 @@ class XPUQuantizeSquashPass : public FusePassBase { */ void MultipleQuantizeSquash(Graph* graph) const; - /* - * Squash scale if dequantize is before scale - */ - void DequantScaleSquash(Graph* graph) const; - - /* - * Squash 
scale if scale is before quantize - */ - void ScaleQuantSquash(Graph* graph) const; - /* * Squash quantize if is before conv2d_xpu/fc_xpuy */ diff --git a/paddle/phi/kernels/xpu/dequantization_kernel.cc b/paddle/phi/kernels/xpu/dequantization_kernel.cc index 759a3fd020458..9dc9868e75fd9 100644 --- a/paddle/phi/kernels/xpu/dequantization_kernel.cc +++ b/paddle/phi/kernels/xpu/dequantization_kernel.cc @@ -56,7 +56,7 @@ void DeQuantizeKernel(const Context& ctx, break; default: PADDLE_THROW(phi::errors::Unavailable( - "Not supported Quantize data type from %d -> %d ", + "Not supported dequantize data type from %d -> %d ", x.dtype(), out_dtype)); } From 4fa68eba56e820e1cbebe7536bb41b1691032ba8 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 11:13:10 +0800 Subject: [PATCH 10/15] fix link quantize_helper.cc library wrong --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/ir/CMakeLists.txt | 9 +-- .../ir/delete_quant_dequant_linear_op_pass.cc | 2 +- .../delete_weight_dequant_linear_op_pass.cc | 2 +- .../framework/ir/quantize_pass_helper.cc | 79 ------------------- .../fluid/framework/ir/quantize_pass_helper.h | 49 ------------ .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 2 +- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 2 +- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 2 +- 9 files changed, 11 insertions(+), 140 deletions(-) delete mode 100644 paddle/fluid/framework/ir/quantize_pass_helper.cc delete mode 100644 paddle/fluid/framework/ir/quantize_pass_helper.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b83568cfdd69a..e13025182ed9d 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -55,13 +55,13 @@ function(pass_library TARGET DEST) ${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry - ${pass_library_DEPS}) + quantize_helper ${pass_library_DEPS}) else() cc_library( ${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry - ${pass_library_DEPS}) + quantize_helper ${pass_library_DEPS}) endif() # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. 
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 47e7a9948856c..305a11805c9b0 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -60,9 +60,9 @@ cc_library( SRCS placement_pass_base.cc DEPS pass) cc_library( - quantize_pass_helper - SRCS quantize_pass_helper.cc - DEPS pass graph graph_helper) + quantize_helper + SRCS quantize_helper.cc + DEPS graph graph_helper) cc_library( coalesce_grad_tensor_pass @@ -245,8 +245,7 @@ if(WITH_XPU) xpu_graph_pattern_detector SRCS xpu/xpu_graph_pattern_detector.cc DEPS graph_pattern_detector) - set(XPU_PASS_DEPS quantize_pass_helper xpu_quant_utils xpu_pass_utils - xpu_graph_pattern_detector) + set(XPU_PASS_DEPS xpu_quant_utils xpu_pass_utils xpu_graph_pattern_detector) pass_library(cast_mixed_precision_op_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) pass_library(yolo_box_xpu_fuse_pass inference DIR xpu DEPS ${XPU_PASS_DEPS}) diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index 025cd0c2b7ddd..916d577d23d60 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -19,7 +19,7 @@ #include #include #include -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc index e30ae85f71c02..87f2de2a59e0d 100644 --- a/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/delete_weight_dequant_linear_op_pass.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "glog/logging.h" diff --git a/paddle/fluid/framework/ir/quantize_pass_helper.cc b/paddle/fluid/framework/ir/quantize_pass_helper.cc deleted file mode 100644 index 730123682f58f..0000000000000 --- a/paddle/fluid/framework/ir/quantize_pass_helper.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SaveQuantInfoInTheGraph( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - const std::unordered_map>& info_map) { - const std::string suffix = "_" + key_suffix + "_" + flag; - if (!graph->Has(flag)) { - graph->Set(flag, new bool(true)); - } - for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { - graph->Set(iter->first + suffix, new std::vector(iter->second)); - } -} - -std::unordered_map> GetQuantInfoFromTheGraph( - ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { - std::unordered_map> info_map; - const std::string suffix = "_" + key_suffix + "_" + flag; - if (graph->Has(flag)) { - std::vector attr_names = graph->AttrNames(); - for (auto fake_name : attr_names) { - size_t pos = fake_name.find(suffix); - if (pos != std::string::npos) { - std::string name = fake_name.substr(0, pos); - auto scales_vector = graph->Get>(fake_name); - info_map.insert(std::make_pair(name, scales_vector)); - } - } - } - return info_map; -} - -bool AreScalesPresentForNodes( - std::unordered_map>* var_quant_scales, - std::initializer_list nodes) { - bool present = true; - for (auto node : nodes) { - if (var_quant_scales->count(node->Name()) == 0) { - present = false; - } - } - return present; -} - -float GetScaleValueForNode( - std::unordered_map>* var_quant_scales, - Node* node) { - return var_quant_scales->at(node->Name())[0]; -} - -std::vector GetScaleVecValueForNode( - std::unordered_map>* var_quant_scales, - Node* node) { - return var_quant_scales->at(node->Name()); -} - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/quantize_pass_helper.h b/paddle/fluid/framework/ir/quantize_pass_helper.h deleted file mode 100644 index 4876cd35a1cf3..0000000000000 --- a/paddle/fluid/framework/ir/quantize_pass_helper.h +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#pragma once - -#include - -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/pass.h" - -namespace paddle { -namespace framework { -namespace ir { - -void SaveQuantInfoInTheGraph( - ir::Graph* graph, - const std::string& flag, - const std::string& key_suffix, - const std::unordered_map>& info_map); - -std::unordered_map> GetQuantInfoFromTheGraph( - ir::Graph* graph, const std::string& flag, const std::string& key_suffix); - -bool AreScalesPresentForNodes( - std::unordered_map>* var_quant_scales, - std::initializer_list nodes); - -float GetScaleValueForNode( - std::unordered_map>* var_quant_scales, - Node* node); - -std::vector GetScaleVecValueForNode( - std::unordered_map>* var_quant_scales, - Node* node); - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index 09037a0fd60eb..f4484689d7994 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 93ad3aec0d16a..852bed2b20af0 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/fluid/framework/ir/xpu/pass_utils.h" #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index 865464dcd7dca..a00879072c30b 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -18,7 +18,7 @@ #include #include -#include "paddle/fluid/framework/ir/quantize_pass_helper.h" +#include "paddle/fluid/framework/ir/quantize_helper.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { From cdeec3964cd6299684b46fa2e7734fb31824a325 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 11:13:32 +0800 Subject: [PATCH 11/15] fix link quantize_helper.cc library wrong --- paddle/fluid/framework/ir/quantize_helper.cc | 79 ++++++++++++++++++++ paddle/fluid/framework/ir/quantize_helper.h | 49 ++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 paddle/fluid/framework/ir/quantize_helper.cc create mode 100644 paddle/fluid/framework/ir/quantize_helper.h diff --git a/paddle/fluid/framework/ir/quantize_helper.cc b/paddle/fluid/framework/ir/quantize_helper.cc new file mode 100644 index 0000000000000..08f2cc457ef2c --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_helper.cc @@ -0,0 +1,79 @@ +// Copyright (c) 
2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/quantize_helper.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map) { + const std::string suffix = "_" + key_suffix + "_" + flag; + if (!graph->Has(flag)) { + graph->Set(flag, new bool(true)); + } + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + graph->Set(iter->first + suffix, new std::vector(iter->second)); + } +} + +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix) { + std::unordered_map> info_map; + const std::string suffix = "_" + key_suffix + "_" + flag; + if (graph->Has(flag)) { + std::vector attr_names = graph->AttrNames(); + for (auto fake_name : attr_names) { + size_t pos = fake_name.find(suffix); + if (pos != std::string::npos) { + std::string name = fake_name.substr(0, pos); + auto scales_vector = graph->Get>(fake_name); + info_map.insert(std::make_pair(name, scales_vector)); + } + } + } + return info_map; +} + +bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes) { + bool present = true; + for (auto node : nodes) { + if (var_quant_scales->count(node->Name()) == 0) { + present = false; + } + } + return present; +} + +float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name())[0]; +} + +std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node) { + return var_quant_scales->at(node->Name()); +} + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/quantize_helper.h b/paddle/fluid/framework/ir/quantize_helper.h new file mode 100644 index 0000000000000..4876cd35a1cf3 --- /dev/null +++ b/paddle/fluid/framework/ir/quantize_helper.h @@ -0,0 +1,49 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SaveQuantInfoInTheGraph( + ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + const std::unordered_map>& info_map); + +std::unordered_map> GetQuantInfoFromTheGraph( + ir::Graph* graph, const std::string& flag, const std::string& key_suffix); + +bool AreScalesPresentForNodes( + std::unordered_map>* var_quant_scales, + std::initializer_list nodes); + +float GetScaleValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +std::vector GetScaleVecValueForNode( + std::unordered_map>* var_quant_scales, + Node* node); + +} // namespace ir +} // namespace framework +} // namespace paddle From d86c4ce5df96e957d4dca62b0c57493000c59923 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 14:58:05 +0800 Subject: [PATCH 12/15] static check fix --- paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc | 1 - paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 852bed2b20af0..2c516ba46851e 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -380,7 +380,6 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( weight_scale = GetScaleVecValueForNode(var_quant_scales, mul_w); } // Create fusion_bias_node - auto filter_dims = filter_t->dims(); Node* fusion_bias_node = nullptr; if (with_bias) { auto* ew_bias_add_bias = diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index a00879072c30b..a7db42f8ec951 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -49,7 +49,6 @@ void XPUQuantizeOpPass::QuantizeInput(Graph* g, Node* op, Node* input, std::string input_arg_name) const { - auto* scope = param_scope(); auto inputs = op->Op()->InputNames(); bool name_found = std::find(inputs.begin(), inputs.end(), input_arg_name) != inputs.end(); @@ -92,7 +91,6 @@ void XPUQuantizeOpPass::DequantizeOutput(Graph* g, Node* op, Node* output, std::string output_arg_name) const { - auto* scope = param_scope(); auto outputs = op->Op()->OutputNames(); bool name_found = std::find(outputs.begin(), outputs.end(), output_arg_name) != @@ -166,7 +164,8 @@ void XPUQuantizeOpPass::QuantizeConv(ir::Graph* graph) const { out_var_node = output_node; } } - if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + if (!AreScalesPresentForNodes(&var_quant_scales_, + {x_var_node, w_var_node})) { MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); return; } From 9a3c5392c03459b7687985227cb0678925bb29a0 Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 15:30:47 +0800 Subject: [PATCH 13/15] remove use mutable_data func and use data func instead --- .../framework/ir/xpu/conv2d_xpu_fuse_pass.cc | 27 +++++++------------ .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 21 +++++---------- .../framework/ir/xpu/xpu_quantize_op_pass.cc | 3 ++- 3 files changed, 18 insertions(+), 33 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc index f4484689d7994..89a558c6601f1 100644 --- 
a/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_xpu_fuse_pass.cc @@ -577,14 +577,10 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( auto bn_mean_t = scope->Var(bn_mean->Name())->GetMutable(); auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); - float* bn_scale_ptr = - bn_scale_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_bias_ptr = - bn_bias_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_mean_ptr = - bn_mean_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_var_ptr = - bn_var_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_scale_ptr = bn_scale_t->data(); + float* bn_bias_ptr = bn_bias_t->data(); + float* bn_mean_ptr = bn_mean_t->data(); + float* bn_var_ptr = bn_var_t->data(); auto mean_len = bn_mean_t->numel(); auto filter_stride = filter_len / mean_len; float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); @@ -594,24 +590,21 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( auto fusion_bias_t = scope->Var(fusion_bias_node->Name())->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* fusion_bias_ptr = fusion_bias_t->data(); // recompute bias and weights for (int i = 0; i < mean_len; ++i) { bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights if (op_weights_precision != "int8") { - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + float* filter_ptr = filter_t->data(); for (int i = 0; i < mean_len; ++i) { for (int j = 0; j < filter_stride; j++) { filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; } } } else { - int8_t* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + int8_t* filter_ptr = filter_t->data(); PADDLE_ENFORCE_EQ( weight_scale.size(), mean_len, @@ -659,8 +652,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( // recompute bias as scale op auto fusion_bias_t = scope->GetVar(fusion_bias_node->Name())->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* fusion_bias_ptr = fusion_bias_t->data(); for (int i = 0; i < bias_len; ++i) { if (bias_after_scale_) { fusion_bias_ptr[i] = fusion_bias_ptr[i] * scale_val_ + bias_val_; @@ -670,8 +662,7 @@ void Conv2dXPUFusePass::CreateFusionWeightsAndBias( } // recompute weight as scale op if (op_weights_precision != "int8") { - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + float* filter_ptr = filter_t->data(); for (int i = 0; i < filter_len; ++i) { filter_ptr[i] *= scale_val_; } diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 2c516ba46851e..373275706700f 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -425,14 +425,10 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( auto bn_mean_t = scope->Var(bn_mean->Name())->GetMutable(); auto bn_var_t = scope->Var(bn_var->Name())->GetMutable(); - float* bn_scale_ptr = - bn_scale_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_bias_ptr = - bn_bias_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_mean_ptr = - bn_mean_t->mutable_data(paddle::platform::CPUPlace()); - float* bn_var_ptr = - bn_var_t->mutable_data(paddle::platform::CPUPlace()); + float* bn_scale_ptr = bn_scale_t->data(); + float* bn_bias_ptr = bn_bias_t->data(); + float* bn_mean_ptr = bn_mean_t->data(); + 
float* bn_var_ptr = bn_var_t->data(); auto mean_len = bn_mean_t->numel(); auto filter_stride = filter_len / mean_len; float epsilon = PADDLE_GET_CONST(float, bn->Op()->GetAttr("epsilon")); @@ -442,24 +438,21 @@ void FcXPUFusePass::CreateFusionWeightsAndBias( auto fusion_bias_t = scope->Var(fusion_bias_node->Name())->GetMutable(); - float* fusion_bias_ptr = - fusion_bias_t->mutable_data(paddle::platform::CPUPlace()); + float* fusion_bias_ptr = fusion_bias_t->data(); // recompute bias and weights for (int i = 0; i < mean_len; ++i) { bn_scale_ptr[i] = bn_scale_ptr[i] / sqrtf(bn_var_ptr[i] + epsilon); } // recompute the weights if (op_weights_precision != "int8") { - float* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + float* filter_ptr = filter_t->data(); for (int i = 0; i < mean_len; ++i) { for (int j = 0; j < filter_stride; j++) { filter_ptr[i * filter_stride + j] *= bn_scale_ptr[i]; } } } else { - int8_t* filter_ptr = - filter_t->mutable_data(paddle::platform::CPUPlace()); + int8_t* filter_ptr = filter_t->data(); PADDLE_ENFORCE_EQ( weight_scale.size(), mean_len, diff --git a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc index a7db42f8ec951..761f17a92e299 100644 --- a/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc +++ b/paddle/fluid/framework/ir/xpu/xpu_quantize_op_pass.cc @@ -237,7 +237,8 @@ void XPUQuantizeOpPass::QuantizeFC(ir::Graph* graph) const { out_var_node = output_node; } } - if (!AreScalesPresentForNodes(&var_quant_scales_, {x_var_node})) { + if (!AreScalesPresentForNodes(&var_quant_scales_, + {x_var_node, w_var_node})) { MarkAndLogCannotQuantizeOp(n, "No scale available for the operator"); return; } From 7c9255ec96b5ad9f5267c2ad3881274279d8925f Mon Sep 17 00:00:00 2001 From: csy0225 Date: Tue, 24 Oct 2023 18:38:30 +0800 Subject: [PATCH 14/15] remove old prepare weight func --- .../ir/xpu/conv2d_transpose_xpu_fuse_pass.cc | 10 +- .../xpu/fused_multi_transformer_xpu_pass.cc | 20 +- .../ir/xpu/multi_encoder_xpu_fuse_pass.cc | 24 +- paddle/fluid/framework/ir/xpu/pass_utils.cc | 223 ++++++------------ paddle/fluid/framework/ir/xpu/pass_utils.h | 16 +- paddle/fluid/framework/ir/xpu/quant_utils.cc | 51 +--- paddle/fluid/framework/ir/xpu/quant_utils.h | 8 - .../inference/analysis/passes/CMakeLists.txt | 9 +- .../passes/convert_to_mixed_precision.cc | 1 - 9 files changed, 120 insertions(+), 242 deletions(-) diff --git a/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc index 784d5d4ec029f..51ebb63c563dc 100644 --- a/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/conv2d_transpose_xpu_fuse_pass.cc @@ -377,8 +377,14 @@ int Conv2dTransposeXPUFusePass::ApplyImpl(ir::Graph* graph, // filter max Node* filter_int16 = nullptr; Node* filter_max = nullptr; - PrepareWeight( - graph, scope, block, conv_filter, &filter_int16, &filter_max, false); + PrepareWeight(graph, + scope, + block, + conv_filter, + &filter_int16, + &filter_max, + false, + std::vector({})); // output && output max std::string conv2d_xpu_out_name; if (!act_type.empty()) { diff --git a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc index 725f4e6a86a49..47bf2b06be9d9 100644 --- a/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fused_multi_transformer_xpu_pass.cc @@ 
-424,11 +424,23 @@ int FusedMultiTransformerXPUPass::FusedMultiTransformerXPUQuant( nullptr, platform::errors::Fatal("w node should not be nullptr")); if (quant_post_dynamic_weight_precision == 0) { - PrepareWeight( - graph, scope, block, w_node, &w_intx, &w_max, need_transpose); + PrepareWeight(graph, + scope, + block, + w_node, + &w_intx, + &w_max, + need_transpose, + std::vector({})); } else { - PrepareWeight( - graph, scope, block, w_node, &w_intx, &w_max, need_transpose); + PrepareWeight(graph, + scope, + block, + w_node, + &w_intx, + &w_max, + need_transpose, + std::vector({})); } w_nodes->push_back(w_node); w_intx_nodes->push_back(w_intx); diff --git a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc index 255c1f5d47a4c..04439608aaa23 100644 --- a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc @@ -561,7 +561,8 @@ void MultiEncoderXPUFusePass::PrepareQKVWeight(Graph* graph, &q_w_fp32_t, &k_w_fp32_t, &v_w_fp32_t}; phi::ConcatKernel(*cpu_ctx, in_tensors, 0, &qkv_w_int16_t); - PrepareWeight(&qkv_w_int16_t, &qkv_w_max_t, false); + ConvertWithQuant( + &qkv_w_int16_t, &qkv_w_max_t, false, std::vector({})); size_t qkv_w_int16_hash = HashTensor(qkv_w_int16_t); size_t qkv_w_max_hash = HashTensor(qkv_w_max_t); std::string qkv_w_int16_name = std::to_string(qkv_w_int16_hash); @@ -813,16 +814,17 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse( &qkv_w_int16, &qkv_w_max); -#define PREPARE_QKV_MATMUL_W(idx_) \ - Node* qkv_matmul_##idx_##_w_int16 = nullptr; \ - Node* qkv_matmul_##idx_##_w_max = nullptr; \ - PrepareWeight(graph, \ - scope, \ - block, \ - qkv_matmul_##idx_##_w, \ - &qkv_matmul_##idx_##_w_int16, \ - &qkv_matmul_##idx_##_w_max, \ - true); +#define PREPARE_QKV_MATMUL_W(idx_) \ + Node* qkv_matmul_##idx_##_w_int16 = nullptr; \ + Node* qkv_matmul_##idx_##_w_max = nullptr; \ + PrepareWeight(graph, \ + scope, \ + block, \ + qkv_matmul_##idx_##_w, \ + &qkv_matmul_##idx_##_w_int16, \ + &qkv_matmul_##idx_##_w_max, \ + true, \ + std::vector({})); PREPARE_QKV_MATMUL_W(1); PREPARE_QKV_MATMUL_W(2); PREPARE_QKV_MATMUL_W(3); diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.cc b/paddle/fluid/framework/ir/xpu/pass_utils.cc index b895033108e12..c6dc291315399 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.cc +++ b/paddle/fluid/framework/ir/xpu/pass_utils.cc @@ -133,79 +133,78 @@ void PrepareWeight(Graph* graph, Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales) { auto weight_name = weight->Name(); auto* weight_tensor = scope->Var(weight_name)->GetMutable(); - phi::DenseTensor quant_weight_tensor; - Assign(*weight_tensor, &quant_weight_tensor); - phi::DenseTensor quant_weight_max_tensor; + phi::DenseTensor dst_weight_tensor; + Assign(*weight_tensor, &dst_weight_tensor); + phi::DenseTensor dst_weight_max_tensor; ConvertWeightWrapper( - &quant_weight_tensor, &quant_weight_max_tensor, transpose, weight_scales); - size_t quant_weight_hash = HashTensor(quant_weight_tensor); - size_t quant_weight_max_hash = HashTensor(quant_weight_max_tensor); + &dst_weight_tensor, &dst_weight_max_tensor, transpose, weight_scales); + size_t dst_weight_hash = HashTensor(dst_weight_tensor); + size_t dst_weight_max_hash = HashTensor(dst_weight_max_tensor); std::string pre_name = GetPrefixWithoutHash(weight_name); - 
std::string quant_weight_name = - pre_name + "_#" + std::to_string(quant_weight_hash); - std::string quant_weight_max_name = - pre_name + "_max_#" + std::to_string(quant_weight_max_hash); - *quant_weight = FindNodeWithName(graph, quant_weight_name); - if (*quant_weight == nullptr) { - // Create quant_weight node - // Update quant_weight var_desc in block - VarDesc quant_weight_desc(quant_weight_name); - quant_weight_desc.SetPersistable(true); - quant_weight_desc.SetShape(vectorize(quant_weight_tensor.dims())); - quant_weight_desc.SetDataType( - framework::TransToProtoVarType(quant_weight_tensor.dtype())); - *quant_weight = graph->CreateVarNode(&quant_weight_desc); - auto* block_quant_weight_desc = block->Var(quant_weight_name); - block_quant_weight_desc->SetPersistable(quant_weight_desc.Persistable()); - block_quant_weight_desc->SetShape(quant_weight_desc.GetShape()); - block_quant_weight_desc->SetDataType(quant_weight_desc.GetDataType()); - // Create quant_weight_max node - // Update quant_weight_max var_desc in block - VarDesc quant_weight_max_desc(quant_weight_max_name); - quant_weight_max_desc.SetPersistable(true); - quant_weight_max_desc.SetShape(vectorize(quant_weight_max_tensor.dims())); - quant_weight_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - *quant_weight_max = graph->CreateVarNode(&quant_weight_max_desc); - auto* block_quant_weight_max_desc = block->Var(quant_weight_max_name); - block_quant_weight_max_desc->SetPersistable( - quant_weight_max_desc.Persistable()); - block_quant_weight_max_desc->SetShape(quant_weight_max_desc.GetShape()); - block_quant_weight_max_desc->SetDataType( - quant_weight_max_desc.GetDataType()); + std::string dst_weight_name = + pre_name + "_#" + std::to_string(dst_weight_hash); + std::string dst_weight_max_name = + pre_name + "_max_#" + std::to_string(dst_weight_max_hash); + *dst_weight = FindNodeWithName(graph, dst_weight_name); + if (*dst_weight == nullptr) { + // Create dst_weight node + // Update dst_weight var_desc in block + VarDesc dst_weight_desc(dst_weight_name); + dst_weight_desc.SetPersistable(true); + dst_weight_desc.SetShape(vectorize(dst_weight_tensor.dims())); + dst_weight_desc.SetDataType( + framework::TransToProtoVarType(dst_weight_tensor.dtype())); + *dst_weight = graph->CreateVarNode(&dst_weight_desc); + auto* block_dst_weight_desc = block->Var(dst_weight_name); + block_dst_weight_desc->SetPersistable(dst_weight_desc.Persistable()); + block_dst_weight_desc->SetShape(dst_weight_desc.GetShape()); + block_dst_weight_desc->SetDataType(dst_weight_desc.GetDataType()); + // Create dst_weight_max node + // Update dst_weight_max var_desc in block + VarDesc dst_weight_max_desc(dst_weight_max_name); + dst_weight_max_desc.SetPersistable(true); + dst_weight_max_desc.SetShape(vectorize(dst_weight_max_tensor.dims())); + dst_weight_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); + *dst_weight_max = graph->CreateVarNode(&dst_weight_max_desc); + auto* block_dst_weight_max_desc = block->Var(dst_weight_max_name); + block_dst_weight_max_desc->SetPersistable( + dst_weight_max_desc.Persistable()); + block_dst_weight_max_desc->SetShape(dst_weight_max_desc.GetShape()); + block_dst_weight_max_desc->SetDataType(dst_weight_max_desc.GetDataType()); // Find dst/dst_max variable in scope - auto* quant_weight_var = scope->FindVar(quant_weight_name); - if (quant_weight_var == nullptr) { - // Create quant_weight/quant_weight_max variable/tensor - Assign(quant_weight_tensor, - scope->Var(quant_weight_name)->GetMutable()); - 
Assign(quant_weight_max_tensor, - scope->Var(quant_weight_max_name)->GetMutable()); + auto* dst_weight_var = scope->FindVar(dst_weight_name); + if (dst_weight_var == nullptr) { + // Create dst_weight/dst_weight_max variable/tensor + Assign(dst_weight_tensor, + scope->Var(dst_weight_name)->GetMutable()); + Assign(dst_weight_max_tensor, + scope->Var(dst_weight_max_name)->GetMutable()); } else { // Share the same variable PADDLE_ENFORCE_NOT_NULL( - scope->FindVar(quant_weight_max_name), - platform::errors::Fatal("quant_weight_max(%s) variable should not be " - "nullptr if quant_weight(%s) " + scope->FindVar(dst_weight_max_name), + platform::errors::Fatal("dst_weight_max(%s) variable should not be " + "nullptr if dst_weight(%s) " "variable is exist. (weight_name is %s)", - quant_weight_max_name, - quant_weight_name, + dst_weight_max_name, + dst_weight_name, weight_name)); } } else { - *quant_weight_max = FindNodeWithName(graph, quant_weight_max_name); + *dst_weight_max = FindNodeWithName(graph, dst_weight_max_name); PADDLE_ENFORCE_NOT_NULL( - *quant_weight_max, - platform::errors::Fatal("quant_weight_max(%s) variable should not be " - "nullptr if quant_weight(%s) " + *dst_weight_max, + platform::errors::Fatal("dst_weight_max(%s) variable should not be " + "nullptr if dst_weight(%s) " "variable is exist. (weight_name is %s)", - quant_weight_max_name, - quant_weight_name, + dst_weight_max_name, + dst_weight_name, weight_name)); } } @@ -215,112 +214,30 @@ template void PrepareWeight( Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales); -template void PrepareWeight( +template void PrepareWeight( Graph* graph, Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales); -template -void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose) { - auto src_name = src->Name(); - auto* src_tensor = scope->Var(src_name)->GetMutable(); - - phi::DenseTensor dst_tensor; - Assign(*src_tensor, &dst_tensor); - phi::DenseTensor dst_max_tensor; - PrepareWeight(&dst_tensor, &dst_max_tensor, transpose); - - size_t dst_hash = HashTensor(dst_tensor); - size_t dst_max_hash = HashTensor(dst_max_tensor); - std::string pre_name = GetPrefixWithoutHash(src_name); - std::string dst_name = pre_name + "_#" + std::to_string(dst_hash); - std::string dst_max_name = pre_name + "_max_#" + std::to_string(dst_max_hash); - *dst = FindNodeWithName(graph, dst_name); - if (*dst == nullptr) { - // Create dst node - // Update dst var_desc in block - VarDesc dst_desc(dst_name); - dst_desc.SetPersistable(true); - dst_desc.SetShape(vectorize(dst_tensor.dims())); - dst_desc.SetDataType(framework::TransToProtoVarType(dst_tensor.dtype())); - *dst = graph->CreateVarNode(&dst_desc); - auto* block_dst_desc = block->Var(dst_name); - block_dst_desc->SetPersistable(dst_desc.Persistable()); - block_dst_desc->SetShape(dst_desc.GetShape()); - block_dst_desc->SetDataType(dst_desc.GetDataType()); - // Create dst_max node - // Update dst_max var_desc in block - VarDesc dst_max_desc(dst_max_name); - dst_max_desc.SetPersistable(true); - dst_max_desc.SetShape(vectorize(dst_max_tensor.dims())); - dst_max_desc.SetDataType(proto::VarType::Type::VarType_Type_FP32); - *dst_max = graph->CreateVarNode(&dst_max_desc); - auto* 
block_dst_max_desc = block->Var(dst_max_name); - block_dst_max_desc->SetPersistable(dst_max_desc.Persistable()); - block_dst_max_desc->SetShape(dst_max_desc.GetShape()); - block_dst_max_desc->SetDataType(dst_max_desc.GetDataType()); - - // Find dst/dst_max variable in scope - auto* dst_var = scope->FindVar(dst_name); - if (dst_var == nullptr) { - // Create dst/dst_max variable/tensor - Assign(dst_tensor, scope->Var(dst_name)->GetMutable()); - Assign(dst_max_tensor, - scope->Var(dst_max_name)->GetMutable()); - } else { - // Share the same variable - PADDLE_ENFORCE_NOT_NULL( - scope->FindVar(dst_max_name), - platform::errors::Fatal( - "dst_max(%s) variable should not be nullptr if dst(%s) " - "variable is exist. (src_name is %s)", - dst_max_name, - dst_name, - src_name)); - } - } else { - *dst_max = FindNodeWithName(graph, dst_max_name); - PADDLE_ENFORCE_NOT_NULL( - *dst_max, - platform::errors::Fatal( - "dst_max(%s) variable should not be nullptr if dst(%s) " - "variable is exist. (src_name is %s)", - dst_max_name, - dst_name, - src_name)); - } -} - -template void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose); -template void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose); +template void PrepareWeight( + Graph* graph, + Scope* scope, + BlockDesc* block, + Node* weight, + Node** dst_weight, + Node** dst_weight_max, + bool transpose, + const std::vector& weight_scales); void PrepareBias( Graph* graph, Scope* scope, BlockDesc* block, Node* src, Node** dst) { diff --git a/paddle/fluid/framework/ir/xpu/pass_utils.h b/paddle/fluid/framework/ir/xpu/pass_utils.h index 417ba361e4348..668519c8eb406 100644 --- a/paddle/fluid/framework/ir/xpu/pass_utils.h +++ b/paddle/fluid/framework/ir/xpu/pass_utils.h @@ -79,22 +79,16 @@ void ConvertWeightWrapper(phi::DenseTensor* weight, ConvertWithoutQuant(weight, weight_max, transpose, weight_scales); } -template -void PrepareWeight(Graph* graph, - Scope* scope, - BlockDesc* block, - Node* src, - Node** dst, - Node** dst_max, - bool transpose); - +// 1. Quant weight from fp32 to int16/int31/int8 +// 2. Weight data is in-place update. +// 3. 
Generate weight max tensor template void PrepareWeight(Graph* graph, Scope* scope, BlockDesc* block, Node* weight, - Node** quant_weight, - Node** quant_weight_max, + Node** dst_weight, + Node** dst_weight_max, bool transpose, const std::vector& weight_scales); diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index 08c1da2148687..a137a006e9f70 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -281,11 +281,6 @@ void ConvertWithQuant(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose, const std::vector& weight_scales) { - if (!weight_scales.empty()) { - LOG(FATAL) << "Weight scales should be empty(), otherwise, check if your " - "model is quant model or not."; - } - // Convert fp16 to fp32 phi::DenseTensor weight_fp32; CastToFp32(weight, &weight_fp32); @@ -349,51 +344,17 @@ template void ConvertWithQuant( bool transpose, const std::vector& weight_scales); -template void ConvertWithoutQuant( +template void ConvertWithQuant( phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose, const std::vector& weight_scales); -template -void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose) { - // Convert fp16 to fp32 - phi::DenseTensor weight_fp32; - CastToFp32(weight, &weight_fp32); - - // Transpose - if (transpose) { - Transpose2D(&weight_fp32); - } - - // Find max - int max_ptr_size = phi::backends::xpu::get_xpu_max_ptr_size(-1); - int size = weight_fp32.numel(); - auto* weight_data = weight_fp32.data(); - float max_val = FindMaxAbs(weight_data, size); - std::vector max_vec(max_ptr_size, max_val); - weight_max->set_type(phi::DataType::FLOAT32); - weight_max->Resize({max_ptr_size}); - auto* cpu_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - memcpy(cpu_ctx->Alloc(weight_max), - max_vec.data(), - max_ptr_size * sizeof(float)); - - // Quant - weight->set_type(phi::CppTypeToDataType::Type()); - weight->Resize(weight_fp32.dims()); - QuantFP32ToIntX(weight_data, cpu_ctx->Alloc(weight), max_val, size); -} - -template void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose); -template void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose); +template void ConvertWithoutQuant( + phi::DenseTensor* weight, + phi::DenseTensor* weight_max, + bool transpose, + const std::vector& weight_scales); bool IsPerTensorQuant(const std::vector& weight_max) { bool per_tensor = true; diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h index b564bcac7202d..1a2952c614542 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.h +++ b/paddle/fluid/framework/ir/xpu/quant_utils.h @@ -51,14 +51,6 @@ void ConvertWithQuant(phi::DenseTensor* weight, bool transpose, const std::vector& weight_scales); -// 1. Quant weight from fp32 to int16/int31 -// 2. Weight data is in-place update. -// 3. 
Generate weight max tensor -template -void PrepareWeight(phi::DenseTensor* weight, - phi::DenseTensor* weight_max, - bool transpose); - bool IsPerTensorQuant(const std::vector& weight_max); } // namespace ir diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 2561e14d06d1e..0af6876faca05 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -13,13 +13,8 @@ cc_library( cc_library( convert_to_mixed_precision SRCS convert_to_mixed_precision.cc - DEPS analysis_pass - ir_graph_build_pass - auto_mixed_precision_pass - constant_folding_pass - identity_op_clean_pass - delete_quant_dequant_linear_op_pass - delete_weight_dequant_linear_op_pass) + DEPS analysis_pass ir_graph_build_pass auto_mixed_precision_pass + constant_folding_pass identity_op_clean_pass) cc_library( ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 3aeeff498a52f..d706113307009 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -95,7 +95,6 @@ void ConvertToMixedPrecisionPass::Run() { framework::ir::AutoMixedPrecisionPass auto_mixed_precision_pass; auto_mixed_precision_pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); - if (backend_ == phi::Backend::GPU) { auto_mixed_precision_pass.Set("enable_gpu_mixed", new bool{true}); } else if (backend_ == phi::Backend::XPU) { From 426c36b117636f1e9ca05139dabeccbfd4628dcb Mon Sep 17 00:00:00 2001 From: csy0225 Date: Wed, 25 Oct 2023 15:15:30 +0800 Subject: [PATCH 15/15] move dequantize/quantize ops yaml pos --- paddle/phi/api/yaml/fused_ops.yaml | 18 ++++++++++++++++++ paddle/phi/api/yaml/ops.yaml | 18 ------------------ 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/phi/api/yaml/fused_ops.yaml b/paddle/phi/api/yaml/fused_ops.yaml index cf1cc6ebcd295..a86aac572c263 100644 --- a/paddle/phi/api/yaml/fused_ops.yaml +++ b/paddle/phi/api/yaml/fused_ops.yaml @@ -71,6 +71,15 @@ data_type : x optional : bias, branch, branch_max ,x_max, scale_max, out_max_in +- op : dequantize_xpu + args : (Tensor x, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : DeQuantizeXPUInferMeta + kernel : + func : dequantize_xpu + data_type: x + - op : embedding_with_eltwise_add_xpu args : (Tensor[] ids, Tensor[] tables, Tensor mask, int64_t padding_idx) output: Tensor(out), Tensor(seq_lod), Tensor(max_seq_len) @@ -254,6 +263,15 @@ data_type : input optional : bias_qk +- op : quantize_xpu + args : (Tensor x, DataType out_dtype, float scale = 1.0f) + output : Tensor(y) + infer_meta : + func : QuantizeXPUInferMeta + kernel : + func : quantize_xpu + data_type : x + - op : squeeze_excitation_block args : (Tensor x, Tensor filter, Tensor filter_max, Tensor bias, Tensor branch, int[] act_type, float[] act_param, int[] filter_dims) output : Tensor(out) diff --git a/paddle/phi/api/yaml/ops.yaml b/paddle/phi/api/yaml/ops.yaml index 33b00e58ce841..aaf6c4e1445ef 100644 --- a/paddle/phi/api/yaml/ops.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -657,15 +657,6 @@ func : depthwise_conv2d backward : depthwise_conv2d_grad -- op : dequantize_xpu - args : (Tensor x, DataType out_dtype, float scale = 1.0f) - output : Tensor(y) 
- infer_meta : - func : DeQuantizeXPUInferMeta - kernel : - func : dequantize_xpu - data_type: x - - op : det args : (Tensor x) output : Tensor @@ -2048,15 +2039,6 @@ func : qr backward : qr_grad -- op : quantize_xpu - args : (Tensor x, DataType out_dtype, float scale = 1.0f) - output : Tensor(y) - infer_meta : - func : QuantizeXPUInferMeta - kernel : - func : quantize_xpu - data_type : x - - op : real args : (Tensor x) output : Tensor (out)
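
For reference, a minimal sketch of how a fuse pass calls the reworked PrepareWeight helper after this series. The argument order follows the declaration added to pass_utils.h above; the template arguments, the float element type of weight_scales, and the local names (graph, scope, block, w_node, w_int16, ...) are assumptions for illustration, since this rendering of the patch drops the text between angle brackets and the real call sites live in the individual passes.

// Non-quantized (fp32/fp16) model: no scales are available, so an empty
// vector is passed and the helper is expected to quantize the fp32 weight
// itself (ConvertWithQuant) while producing the per-tensor max tensor.
Node* w_int16 = nullptr;
Node* w_int16_max = nullptr;
PrepareWeight</*Tcpu=*/float, /*Txpu=*/int16_t>(graph,
                                                scope,
                                                block,
                                                w_node,
                                                &w_int16,
                                                &w_int16_max,
                                                /*transpose=*/true,
                                                std::vector<float>({}));

// Offline-quantized int8 model: the weight tensor already holds integer data
// and the collected scales are forwarded, so the helper only needs the
// ConvertWithoutQuant path to build the weight max tensor.
Node* w_int8 = nullptr;
Node* w_int8_max = nullptr;
PrepareWeight</*Tcpu=*/int8_t, /*Txpu=*/int8_t>(graph,
                                                scope,
                                                block,
                                                w_node,
                                                &w_int8,
                                                &w_int8_max,
                                                /*transpose=*/false,
                                                weight_scales);

Both variants hash the converted tensor to derive the new node name, so passes that prepare the same weight more than once end up sharing a single persistable weight node and max-tensor node.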
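
The quantize_xpu / dequantize_xpu entries moved into fused_ops.yaml follow the usual yaml-to-phi mapping: the yaml args become InferMeta and kernel parameters in order, the output is appended last, and data_type : x keys kernel dtype dispatch to the input tensor. A plausible shape of the referenced InferMeta functions is sketched below; the signatures and bodies are assumptions for illustration, not code taken from this patch.

// Assumed mapping of the yaml args onto InferMeta parameters; the output
// keeps the input shape and takes the requested out_dtype.
void QuantizeXPUInferMeta(const MetaTensor& x,
                          DataType out_dtype,
                          float scale,
                          MetaTensor* y) {
  y->set_dims(x.dims());    // quantization preserves the shape
  y->set_dtype(out_dtype);  // integer dtype chosen by the calling pass
}

void DeQuantizeXPUInferMeta(const MetaTensor& x,
                            DataType out_dtype,
                            float scale,
                            MetaTensor* y) {
  y->set_dims(x.dims());    // dequantization preserves the shape
  y->set_dtype(out_dtype);  // floating-point dtype chosen by the calling pass
}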