From bd117a064e4875c64479c8d9992b98cf62b41b5f Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Thu, 9 Mar 2023 10:29:43 +0100 Subject: [PATCH 01/23] Fused elementwises kernels and ops --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- ...se_pass.cc => elt_act_onednn_fuse_pass.cc} | 7 +- ...fuse_pass.h => elt_act_onednn_fuse_pass.h} | 0 .../framework/ir/mkldnn/mkldnn_pass_util.h | 6 +- .../inference/api/paddle_pass_builder.cc | 2 +- .../compat/fused_elementwise_add.pbtxt | 50 ++++ .../compat/fused_elementwise_div.pbtxt | 50 ++++ .../compat/fused_elementwise_mul.pbtxt | 50 ++++ .../compat/fused_elementwise_sub.pbtxt | 51 +++++ .../fused/fused_elementwise_add_op.cc | 56 +++++ .../fused/fused_elementwise_div_op.cc | 56 +++++ .../fused/fused_elementwise_mul_op.cc | 56 +++++ .../operators/fused/fused_elementwise_op.h | 119 ++++++++++ .../fused/fused_elementwise_sub_op.cc | 56 +++++ .../fusion/onednn/fused_elementwise_kernel.cc | 213 ++++++++++++++++++ .../phi/kernels/onednn/elementwise_kernel.cc | 29 +-- .../phi/ops/compat/fused_elementwise_sig.cc | 145 ++++++++++++ .../test_mkldnn_elt_act_fuse_pass.py | 2 +- ...nn_elementwise_add_activation_fuse_pass.py | 6 +- 19 files changed, 921 insertions(+), 35 deletions(-) rename paddle/fluid/framework/ir/mkldnn/{elt_act_mkldnn_fuse_pass.cc => elt_act_onednn_fuse_pass.cc} (95%) rename paddle/fluid/framework/ir/mkldnn/{elt_act_mkldnn_fuse_pass.h => elt_act_onednn_fuse_pass.h} (100%) create mode 100644 paddle/fluid/operators/compat/fused_elementwise_add.pbtxt create mode 100644 paddle/fluid/operators/compat/fused_elementwise_div.pbtxt create mode 100644 paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt create mode 100644 paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt create mode 100644 paddle/fluid/operators/fused/fused_elementwise_add_op.cc create mode 100644 paddle/fluid/operators/fused/fused_elementwise_div_op.cc create mode 100644 paddle/fluid/operators/fused/fused_elementwise_mul_op.cc create mode 100644 paddle/fluid/operators/fused/fused_elementwise_op.h create mode 100644 paddle/fluid/operators/fused/fused_elementwise_sub_op.cc create mode 100644 paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc create mode 100644 paddle/phi/ops/compat/fused_elementwise_sig.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index e602b899fe62e6..13266b12a3b9ff 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -175,7 +175,7 @@ if(WITH_MKLDNN) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(elt_act_onednn_fuse_pass inference DIR mkldnn) pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(layer_norm_onednn_optimization_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc index 4b3f6a95d6dd17..9999ee3e09a3d0 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.cc +++ 
b/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" @@ -62,6 +62,7 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( GET_IR_NODE_FROM_SUBGRAPH( activation_out, activation_out, elementwise_act_pattern); + ConvertToFusedOp(elementwise->Op()); SetActivationAttrs(elementwise->Op(), activation->Op(), act_type); elementwise->Op()->SetOutput("Out", {activation_out->Name()}); @@ -84,9 +85,9 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( } // namespace framework } // namespace paddle -REGISTER_PASS(elt_act_mkldnn_fuse_pass, +REGISTER_PASS(elt_act_onednn_fuse_pass, paddle::framework::ir::ElementwiseActivationOneDNNPass); -REGISTER_PASS_CAPABILITY(elt_act_mkldnn_fuse_pass) +REGISTER_PASS_CAPABILITY(elt_act_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("elementwise_add", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/elt_act_mkldnn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index 142bb9adb68dca..f946948428a4a2 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -160,7 +160,11 @@ inline void ConvertToFusedOp(OpDesc* op) { {"conv2d", "fused_conv2d"}, {"depthwise_conv2d", "fused_conv2d"}, {"matmul", "fused_matmul"}, - {"matmul_v2", "fused_matmul"}}; + {"matmul_v2", "fused_matmul"}, + {"elementwise_add", "fused_elementwise_add"}, + {"elementwise_sub", "fused_elementwise_sub"}, + {"elementwise_mul", "fused_elementwise_mul"}, + {"elementwise_mul", "fused_elementwise_div"}}; if (op->Type() == "matmul") { op->SetAttr("trans_x", op->GetAttr("transpose_X")); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 23fdaf3ddff334..562c3502ce6d32 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -377,7 +377,7 @@ void CpuPassStrategy::EnableMKLDNN() { "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // - "elt_act_mkldnn_fuse_pass", // + "elt_act_onednn_fuse_pass", // "layer_norm_onednn_optimization_pass", // "operator_scale_onednn_fuse_pass", // "operator_unsqueeze2_onednn_fuse_pass", // diff --git a/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt new file mode 100644 index 00000000000000..3514e06839bdc1 --- /dev/null +++ b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt @@ -0,0 +1,50 @@ +type: "fused_elementwise_add" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs 
{ + name: "alpha" + type: FLOAT + } + attrs { + name: "beta" + type: FLOAT + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt new file mode 100644 index 00000000000000..0690a2983e9688 --- /dev/null +++ b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt @@ -0,0 +1,50 @@ +type: "fused_elementwise_div" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "alpha" + type: FLOAT + } + attrs { + name: "beta" + type: FLOAT + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt new file mode 100644 index 00000000000000..5a1a27d38fc0fd --- /dev/null +++ b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt @@ -0,0 +1,50 @@ +type: "fused_elementwise_mul" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "alpha" + type: FLOAT + } + attrs { + name: "beta" + type: FLOAT + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } +} diff --git a/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt new file mode 100644 index 00000000000000..c12b3ca5f3a41b --- /dev/null +++ b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt @@ -0,0 +1,51 @@ +type: "fused_elementwise_sub" +def { + inputs { + name: "X" + } + inputs { + name: "Y" + } + outputs { + name: "Out" + } + attrs { + name: "axis" + type: INT + } +} +extra { + attrs { + name: "x_data_format" + type: STRING + } + attrs { + name: "y_data_format" + type: STRING + } + attrs { + name: "alpha" + type: FLOAT + } + attrs { + name: "beta" + type: FLOAT + } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } + attrs { + name: "fuse_activation" + type: STRING + } + } +} diff --git a/paddle/fluid/operators/fused/fused_elementwise_add_op.cc b/paddle/fluid/operators/fused/fused_elementwise_add_op.cc new file mode 100644 index 00000000000000..e8b689a9eb1d37 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_elementwise_add_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/fused/fused_elementwise_op.h" + +namespace paddle { +namespace operators { + +class FusedElementwiseAddOpMaker : public FusedElementwiseOpMaker { + protected: + std::string GetName() const override { return "Add"; } + std::string GetEquation() const override { return "Out = X + Y"; } + + void AddInputX() override { + AddInput( + "X", + "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " + "should be int32, int64, float32, float64."); + } + + void AddInputY() override { + AddInput( + "Y", + "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " + "should be int32, int64, float32, float64."); + } + + std::string GetOpFuntionality() const override { + return "Add two tensors element-wise"; + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + fused_elementwise_add, + ops::ElementwiseOp, + ops::FusedElementwiseAddOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_elementwise_div_op.cc b/paddle/fluid/operators/fused/fused_elementwise_div_op.cc new file mode 100644 index 00000000000000..3f36ab85e73718 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_elementwise_div_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/fused/fused_elementwise_op.h" + +namespace paddle { +namespace operators { + +class FusedElementwiseDivOpMaker : public FusedElementwiseOpMaker { + protected: + std::string GetName() const override { return "Div"; } + std::string GetEquation() const override { return "Out = X / Y"; } + + void AddInputX() override { + AddInput( + "X", + "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " + "should be int32, int64, float32, float64."); + } + + void AddInputY() override { + AddInput( + "Y", + "(Variable), Tensor or phi::DenseTensor of any dimensions. 
Its dtype " + "should be int32, int64, float32, float64."); + } + + std::string GetOpFuntionality() const override { + return "Divide two tensors element-wise"; + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + fused_elementwise_div, + ops::ElementwiseOp, + ops::FusedElementwiseDivOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_elementwise_mul_op.cc b/paddle/fluid/operators/fused/fused_elementwise_mul_op.cc new file mode 100644 index 00000000000000..92a73e1df56d14 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_elementwise_mul_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/fused/fused_elementwise_op.h" + +namespace paddle { +namespace operators { + +class FusedElementwiseMulOpMaker : public FusedElementwiseOpMaker { + protected: + std::string GetName() const override { return "Mul"; } + std::string GetEquation() const override { return "Out = X \\\\odot Y"; } + + void AddInputX() override { + AddInput( + "X", + "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " + "should be int32, int64, float32, float64."); + } + + void AddInputY() override { + AddInput( + "Y", + "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " + "should be int32, int64, float32, float64."); + } + + std::string GetOpFuntionality() const override { + return "Multiply two tensors element-wise"; + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + fused_elementwise_mul, + ops::ElementwiseOp, + ops::FusedElementwiseMulOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_elementwise_op.h b/paddle/fluid/operators/fused/fused_elementwise_op.h new file mode 100644 index 00000000000000..ca1e368fd06625 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_elementwise_op.h @@ -0,0 +1,119 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInputX(); + AddInputY(); + AddOpOutput(); + AddAttr("axis", + "(int, default -1). If X.dimension != Y.dimension," + "Y.dimension must be a subsequence of x.dimension. And axis " + "is the start dimension index " + "for broadcasting Y onto X. ") + .SetDefault(-1); + + AddAttr("Alfa", "Alfa value for the elementwise operator") + .SetDefault(1.0f); + AddAttr("Beta", "Beta value for the elementwise operator") + .SetDefault(1.0f); + + AddAttr("fuse_activation", + "Activation type from elt_act_onednn_fuse_pass") + .SetDefault(""); + + AddAttr("Scale_x", "Obtained from cpu_quantize_pass") + .SetDefault(1.0f); + AddAttr("Scale_y", "Obtained from cpu_quantize_pass") + .SetDefault(1.0f); + AddAttr("Scale_out", "Obtained from cpu_quantize_pass") + .SetDefault(1.0f); + AddOpComment(); + } + + protected: + virtual void AddInputX() { + AddInput("X", "(Tensor), The first input tensor of elementwise op."); + } + virtual void AddInputY() { + AddInput("Y", "(Tensor), The second input tensor of elementwise op."); + } + virtual void AddOpOutput() { + AddOutput("Out", + "N-dimension tensor. A location into which the result is stored. " + "It's dimension " + "equals with x"); + } + virtual void AddOpComment() { AddComment(GetCommentExamples()); } + + virtual std::string GetOpFuntionality() const { return ""; } + + virtual std::string GetName() const = 0; + virtual std::string GetEquation() const = 0; + + std::string GetCommentExamples() const { + return string::Sprintf(R"DOC( +Elementwise %s Operator. + +%s + +The equation is: + +$$%s$$ + +- $X$: a tensor of any dimension. +- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. + +There are two cases for this operator: + +1. The shape of $Y$ is the same with $X$. +2. The shape of $Y$ is a continuous subsequence of $X$. + +For case 2: + +1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index + for broadcasting $Y$ onto $X$. +2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. +3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of + subsequence, such as shape(Y) = (2, 1) => (2). + +For example: + + .. code-block:: text + + shape(X) = (2, 3, 4, 5), shape(Y) = (,) + shape(X) = (2, 3, 4, 5), shape(Y) = (5,) + shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 + shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 + shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 + shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 + +)DOC", + GetName(), + GetOpFuntionality(), + GetEquation()); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_elementwise_sub_op.cc b/paddle/fluid/operators/fused/fused_elementwise_sub_op.cc new file mode 100644 index 00000000000000..0b345134f663f7 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_elementwise_sub_op.cc @@ -0,0 +1,56 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/fused/fused_elementwise_op.h" + +namespace paddle { +namespace operators { + +class FusedElementwiseSubOpMaker : public FusedElementwiseOpMaker { + protected: + std::string GetName() const override { return "Sub"; } + std::string GetEquation() const override { return "Out = X - Y"; } + + void AddInputX() override { + AddInput( + "X", + "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " + "should be int32, int64, float32, float64."); + } + + void AddInputY() override { + AddInput( + "Y", + "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " + "should be int32, int64, float32, float64."); + } + + std::string GetOpFuntionality() const override { + return "Subtract two tensors element-wise"; + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + fused_elementwise_sub, + ops::ElementwiseOp, + ops::FusedElementwiseSubOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc new file mode 100644 index 00000000000000..b10b3902b0bdba --- /dev/null +++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc @@ -0,0 +1,213 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void FusedElementwiseKernel(const OneDNNContext& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + int axis, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + float scale_x = dev_ctx.HasDnnAttr("Scale_x") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")) + : 1.0f; + float scale_y = dev_ctx.HasDnnAttr("Scale_y") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")) + : 1.0f; + float scale_out = + dev_ctx.HasDnnAttr("Scale_out") + ? 
PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")) + : 1.0f; + + dnnl::post_ops post_operations; + funcs::AppendActivation(dev_ctx, post_operations); + if (dev_ctx.HasDnnAttr("fused_output_scale")) { + float scale_alpha = + PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fused_output_scale")); + post_operations.append_eltwise( + 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); + } + + auto* non_const_x = &x; + auto* non_const_y = &y; + + funcs::BinaryOneDNNHandler handler(BINARY_OP, + axis, + onednn_engine, + dev_ctx.GetPlace(), + non_const_x, + non_const_y, + out, + scale_x, + scale_y, + scale_out, + true, + post_operations); + + // oneDNN's binary is optimized for broadcasting y into x, so in other case + // we have to swap tensors to achieve optimal performance + if (x.numel() < y.numel()) { + std::swap(non_const_x, non_const_y); + } + + const auto src_x_memory = handler.AcquireSrcMemory(non_const_x); + const auto src_y_memory = handler.AcquireSecondSrcMemory(non_const_y); + // (jczaja) For Inplace src and dst should be the same memory object. + // So x should share buffer with z. But UT mechanics is testing inplace + // execution for this op not checking that x can be bradcasted to match in + // shape y tensor. + // This is wrong as when x is to be broadcasted then z(out) will match the + // shape of y which is bigger than x. Hence if x is smaller in shape than z + // and they share a buffer (of + // shape x) then this buffer is not big enough to hold result of elementwise + // operation. + const bool reuse_x_memory = non_const_x->numel() == out->numel() && + non_const_x->IsSharedBufferWith(*out); + std::shared_ptr dst_memory; + + if (reuse_x_memory) { + dst_memory = src_x_memory; + // NOTE(chenfeiyu): when the output reuses memory from other tensor rather + // than allocate its own, it's still need to take care of its data type. + // Unfortunately, paddle's operator only infers the output' shape, but not + // the data type. Alloc takes care of allocation and data type + // normally, but if the memory is already allocated and there is no need + // to re-allocate, it just set the data type. So this it added there to + // get the right data type. 
+ dev_ctx.template Alloc(out); + } else { + dst_memory = handler.AcquireDstMemory(out); + } + + const auto binary_prim = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_x_memory}, + {DNNL_ARG_SRC_1, *src_y_memory}, + {DNNL_ARG_DST, *dst_memory}}; + + binary_prim->execute(astream, args); + astream.wait(); + + if (handler.use_broadcasting_hack == false) { + funcs::SetOutMemDescWithLogicalLayoutFusesSupport( + dev_ctx, out, dst_memory->get_desc()); + } else { + auto dims = dst_memory->get_desc().dims(); + dims.insert(dims.begin(), non_const_x->dims()[0]); + dims[1] /= dims[0]; + funcs::SetOutMemDescWithLogicalLayoutFusesSupport( + dev_ctx, out, dst_memory->get_desc().reshape(dims)); + } +} + +#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \ + template \ + void name##RawKernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + int axis, \ + DenseTensor* out) { \ + FusedElementwiseKernel(dev_ctx, x, y, axis, out); \ + } \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + FusedElementwiseKernel(dev_ctx, x, y, -1, out); \ + } + +DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedAdd, dnnl::algorithm::binary_add) +DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedSubtract, dnnl::algorithm::binary_sub) +DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedMultiply, dnnl::algorithm::binary_mul) +DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedDivide, dnnl::algorithm::binary_div) + +} // namespace phi + +PD_REGISTER_KERNEL(fused_add_raw, + OneDNN, + ONEDNN, + phi::FusedAddRawKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(fused_add, + OneDNN, + ONEDNN, + phi::FusedAddKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(fused_subtract_raw, + OneDNN, + ONEDNN, + phi::FusedSubtractRawKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(fused_subtract, + OneDNN, + ONEDNN, + phi::FusedSubtractKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(fused_multiply_raw, + OneDNN, + ONEDNN, + phi::FusedMultiplyRawKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(fused_multiply, + OneDNN, + ONEDNN, + phi::FusedMultiplyKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} + +PD_REGISTER_KERNEL(fused_divide_raw, + OneDNN, + ONEDNN, + phi::FusedDivideRawKernel, + float, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(fused_divide, + OneDNN, + ONEDNN, + phi::FusedDivideKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/elementwise_kernel.cc b/paddle/phi/kernels/onednn/elementwise_kernel.cc index b786da7a319156..fc1255c5b8cb27 100644 --- a/paddle/phi/kernels/onednn/elementwise_kernel.cc +++ b/paddle/phi/kernels/onednn/elementwise_kernel.cc @@ -30,26 +30,6 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); - float scale_x = dev_ctx.HasDnnAttr("Scale_x") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")) - : 1.0f; - float scale_y = dev_ctx.HasDnnAttr("Scale_y") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")) - : 1.0f; - float scale_out = - dev_ctx.HasDnnAttr("Scale_out") - ? 
PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")) - : 1.0f; - - dnnl::post_ops post_operations; - funcs::AppendActivation(dev_ctx, post_operations); - if (dev_ctx.HasDnnAttr("fused_output_scale")) { - float scale_alpha = - PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fused_output_scale")); - post_operations.append_eltwise( - 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); - } - auto* non_const_x = &x; auto* non_const_y = &y; @@ -60,11 +40,10 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, non_const_x, non_const_y, out, - scale_x, - scale_y, - scale_out, - true, - post_operations); + 1.0f, + 1.0f, + 1.0f, + true); // oneDNN's binary is optimized for broadcasting y into x, so in other case // we have to swap tensors to achieve optimal performance diff --git a/paddle/phi/ops/compat/fused_elementwise_sig.cc b/paddle/phi/ops/compat/fused_elementwise_sig.cc new file mode 100644 index 00000000000000..4b1b9599ab32d7 --- /dev/null +++ b/paddle/phi/ops/compat/fused_elementwise_sig.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature FusedElementwiseAddOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("fused_add", + {"X", "Y"}, + {"alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); + } + return KernelSignature("fused_add_raw", + {"X", "Y"}, + {"axis", + "alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); +} + +KernelSignature FusedElementwiseSubOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("fused_subtract", + {"X", "Y"}, + {"alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); + } + return KernelSignature("fused_subtract_raw", + {"X", "Y"}, + {"axis", + "alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); +} + +KernelSignature FusedElementwiseMulOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("fused_multiply", + {"X", "Y"}, + {"alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); + } + return KernelSignature("fused_multiply_raw", + {"X", "Y"}, + {"axis", + "alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); +} + +KernelSignature FusedElementwiseDivOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int axis = paddle::any_cast(ctx.Attr("axis")); + if (axis == -1) { + return KernelSignature("fused_divide", + {"X", "Y"}, 
+ {"alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); + } + return KernelSignature("fused_divide_raw", + {"X", "Y"}, + {"axis", + "alpha", + "beta", + "fuse_activation", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out"}, + {"Out"}); +} + +} // namespace phi + +PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_add, fused_add); +PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_sub, fused_subtract); +PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_mul, fused_multiply); +PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_div, fused_divide); + +PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_add, + phi::FusedElementwiseAddOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_sub, + phi::FusedElementwiseSubOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_mul, + phi::FusedElementwiseMulOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_div, + phi::FusedElementwiseDivOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index 24a63751cfec43..3315534821dc02 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -26,7 +26,7 @@ class ElementwiseActivationMkldnnFusePassTest(InferencePassTest): act_alpha = None act_beta = None - pass_name = 'elt_act_mkldnn_fuse_pass' + pass_name = 'elt_act_onednn_fuse_pass' def setUp(self): self.set_params() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py index 93165f829b1c8a..92341facaaf217 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py @@ -118,17 +118,17 @@ def sample_predictor_configs(self, program_config): config = self.create_inference_config( use_mkldnn=True, passes=[ - 'elt_act_mkldnn_fuse_pass', + 'elt_act_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', ], ) - yield config, ['elementwise_add'], (1e-5, 1e-5) + yield config, ['fused_elementwise_add'], (1e-5, 1e-5) def test(self): self.run_and_statis( quant=False, passes=[ - 'elt_act_mkldnn_fuse_pass', + 'elt_act_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', ], ) From 25256810c8b49dc583cbf5f2414ece223ff5fe67 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 9 Mar 2023 14:09:08 +0100 Subject: [PATCH 02/23] change fuse pass name --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- ...elementwise_activation_onednn_fuse_pass.cc} | 6 +++--- ... 
elementwise_activation_onednn_fuse_pass.h} | 0 .../fluid/inference/api/paddle_pass_builder.cc | 18 +++++++++--------- .../operators/fused/fused_elementwise_op.h | 5 +++-- .../inference/test_mkldnn_elt_act_fuse_pass.py | 2 +- ...dnn_elementwise_add_activation_fuse_pass.py | 4 ++-- 7 files changed, 19 insertions(+), 18 deletions(-) rename paddle/fluid/framework/ir/mkldnn/{elt_act_onednn_fuse_pass.cc => elementwise_activation_onednn_fuse_pass.cc} (94%) rename paddle/fluid/framework/ir/mkldnn/{elt_act_onednn_fuse_pass.h => elementwise_activation_onednn_fuse_pass.h} (100%) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index aa57c267c0c0f3..6f19b642a34be7 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -175,7 +175,7 @@ if(WITH_MKLDNN) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(elt_act_onednn_fuse_pass inference DIR mkldnn) + pass_library(elementwise_activation_onednn_fuse_pass inference DIR mkldnn) pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(layer_norm_onednn_optimization_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elementwise_activation_onednn_fuse_pass.cc similarity index 94% rename from paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/elementwise_activation_onednn_fuse_pass.cc index 9999ee3e09a3d0..2725dae2820572 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elementwise_activation_onednn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/elementwise_activation_onednn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" @@ -85,9 +85,9 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( } // namespace framework } // namespace paddle -REGISTER_PASS(elt_act_onednn_fuse_pass, +REGISTER_PASS(elementwise_activation_onednn_fuse_pass, paddle::framework::ir::ElementwiseActivationOneDNNPass); -REGISTER_PASS_CAPABILITY(elt_act_onednn_fuse_pass) +REGISTER_PASS_CAPABILITY(elementwise_activation_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("elementwise_add", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elementwise_activation_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/elementwise_activation_onednn_fuse_pass.h diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index c1fa7654005239..1fa42cbbe5fa80 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -373,15 +373,15 @@ void CpuPassStrategy::EnableMKLDNN() { // Disabled due to topology-dependent speed-up "fc_mkldnn_pass", "fc_act_mkldnn_fuse_pass", - "fc_elementwise_add_mkldnn_fuse_pass", // - "batch_norm_act_fuse_pass", // - "softplus_activation_mkldnn_fuse_pass", // - "shuffle_channel_mkldnn_detect_pass", // - "elt_act_onednn_fuse_pass", // - "layer_norm_onednn_optimization_pass", // - "operator_scale_onednn_fuse_pass", // - "operator_unsqueeze2_onednn_fuse_pass", // - "operator_reshape2_onednn_fuse_pass", // + "fc_elementwise_add_mkldnn_fuse_pass", // + "batch_norm_act_fuse_pass", // + "softplus_activation_mkldnn_fuse_pass", // + "shuffle_channel_mkldnn_detect_pass", // + "elementwise_activation_onednn_fuse_pass", // + "layer_norm_onednn_optimization_pass", // + "operator_scale_onednn_fuse_pass", // + "operator_unsqueeze2_onednn_fuse_pass", // + "operator_reshape2_onednn_fuse_pass", // })) { passes_.push_back(pass); } diff --git a/paddle/fluid/operators/fused/fused_elementwise_op.h b/paddle/fluid/operators/fused/fused_elementwise_op.h index ca1e368fd06625..730623d078ad4f 100644 --- a/paddle/fluid/operators/fused/fused_elementwise_op.h +++ b/paddle/fluid/operators/fused/fused_elementwise_op.h @@ -38,8 +38,9 @@ class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("Beta", "Beta value for the elementwise operator") .SetDefault(1.0f); - AddAttr("fuse_activation", - "Activation type from elt_act_onednn_fuse_pass") + AddAttr( + "fuse_activation", + "Activation type from elementwise_activation_onednn_fuse_pass") .SetDefault(""); AddAttr("Scale_x", "Obtained from cpu_quantize_pass") diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py index 3315534821dc02..721c2f19455fbe 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py @@ -26,7 +26,7 @@ class ElementwiseActivationMkldnnFusePassTest(InferencePassTest): act_alpha = None act_beta = None 
- pass_name = 'elt_act_onednn_fuse_pass' + pass_name = 'elementwise_activation_onednn_fuse_pass' def setUp(self): self.set_params() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py index 92341facaaf217..4565560f49b9d9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py @@ -118,7 +118,7 @@ def sample_predictor_configs(self, program_config): config = self.create_inference_config( use_mkldnn=True, passes=[ - 'elt_act_onednn_fuse_pass', + 'elementwise_activation_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', ], ) @@ -128,7 +128,7 @@ def test(self): self.run_and_statis( quant=False, passes=[ - 'elt_act_onednn_fuse_pass', + 'elementwise_activation_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', ], ) From 3c8b7d3d72d9f9190d38633a562e9e26d29f5e41 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 9 Mar 2023 14:27:27 +0100 Subject: [PATCH 03/23] adjust .pbtxt files --- .../operators/compat/elementwise_add.pbtxt | 20 ---------- .../operators/compat/elementwise_div.pbtxt | 20 ---------- .../operators/compat/elementwise_mul.pbtxt | 22 ----------- .../operators/compat/elementwise_pow.pbtxt | 20 ---------- .../operators/compat/elementwise_sub.pbtxt | 20 ---------- .../compat/fused_elementwise_add.pbtxt | 22 +++++------ .../compat/fused_elementwise_div.pbtxt | 30 +------------- .../compat/fused_elementwise_mul.pbtxt | 22 +++++------ .../compat/fused_elementwise_sub.pbtxt | 23 +++++------ paddle/fluid/operators/feed_forward_test.cu | 10 +---- paddle/fluid/operators/ops_extra_info.h | 2 - paddle/phi/api/yaml/op_compat.yaml | 39 +++++++------------ 12 files changed, 42 insertions(+), 208 deletions(-) diff --git a/paddle/fluid/operators/compat/elementwise_add.pbtxt b/paddle/fluid/operators/compat/elementwise_add.pbtxt index 5b55f3981c77da..fdafc47dcd12ac 100644 --- a/paddle/fluid/operators/compat/elementwise_add.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_add.pbtxt @@ -22,25 +22,5 @@ extra { attrs { name: "Out0_threshold" type: FLOAT - } - attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "Scale_x" - type: FLOAT - } - attrs { - name: "Scale_y" - type: FLOAT - } - attrs { - name: "Scale_out" - type: FLOAT } } diff --git a/paddle/fluid/operators/compat/elementwise_div.pbtxt b/paddle/fluid/operators/compat/elementwise_div.pbtxt index a73d2072029a3a..4baa9dea4fcdd5 100644 --- a/paddle/fluid/operators/compat/elementwise_div.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_div.pbtxt @@ -15,26 +15,6 @@ def { } } extra { - attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "Scale_x" - type: FLOAT - } - attrs { - name: "Scale_y" - type: FLOAT - } - attrs { - name: "Scale_out" - type: FLOAT - } attrs { name: "act" type: STRING diff --git a/paddle/fluid/operators/compat/elementwise_mul.pbtxt b/paddle/fluid/operators/compat/elementwise_mul.pbtxt index 22289e2689c10a..0db1c9c782ae2d 100644 --- a/paddle/fluid/operators/compat/elementwise_mul.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_mul.pbtxt @@ -14,25 +14,3 @@ def { type: INT } } -extra { - attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: 
"y_data_format" - type: STRING - } - attrs { - name: "Scale_x" - type: FLOAT - } - attrs { - name: "Scale_y" - type: FLOAT - } - attrs { - name: "Scale_out" - type: FLOAT - } -} diff --git a/paddle/fluid/operators/compat/elementwise_pow.pbtxt b/paddle/fluid/operators/compat/elementwise_pow.pbtxt index a2ab73f409b780..018ca70bcbca19 100644 --- a/paddle/fluid/operators/compat/elementwise_pow.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_pow.pbtxt @@ -15,26 +15,6 @@ def { } } extra { - attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "Scale_x" - type: FLOAT - } - attrs { - name: "Scale_y" - type: FLOAT - } - attrs { - name: "Scale_out" - type: FLOAT - } attrs { name: "act" type: STRING diff --git a/paddle/fluid/operators/compat/elementwise_sub.pbtxt b/paddle/fluid/operators/compat/elementwise_sub.pbtxt index 9f38601f585ea8..ae8a1352452d27 100644 --- a/paddle/fluid/operators/compat/elementwise_sub.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_sub.pbtxt @@ -15,26 +15,6 @@ def { } } extra { - attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "Scale_x" - type: FLOAT - } - attrs { - name: "Scale_y" - type: FLOAT - } - attrs { - name: "Scale_out" - type: FLOAT - } attrs { name: "act" type: STRING diff --git a/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt index 3514e06839bdc1..374170aa4e97fe 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt @@ -16,35 +16,31 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" + name: "fuse_activation" type: STRING } attrs { - name: "alpha" + name: "fuse_alpha" type: FLOAT } attrs { - name: "beta" + name: "fuse_beta" type: FLOAT - } + } attrs { - name: "Scale_x" + name: "fused_output_scale" type: FLOAT } attrs { - name: "Scale_y" + name: "scale_x" type: FLOAT } attrs { - name: "Scale_out" + name: "scale_y" type: FLOAT } attrs { - name: "fuse_activation" - type: STRING + name: "scale_out" + type: FLOAT } } diff --git a/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt index 0690a2983e9688..1eb6230dfc36d0 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt @@ -16,35 +16,7 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "alpha" - type: FLOAT - } - attrs { - name: "beta" - type: FLOAT - } - attrs { - name: "Scale_x" + name: "fused_output_scale" type: FLOAT } - attrs { - name: "Scale_y" - type: FLOAT - } - attrs { - name: "Scale_out" - type: FLOAT - } - attrs { - name: "fuse_activation" - type: STRING - } } diff --git a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt index 5a1a27d38fc0fd..14b099ab9f8fcb 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt @@ -16,35 +16,31 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" + name: "fuse_activation" type: STRING } attrs { - name: "alpha" + name: "fuse_alpha" type: FLOAT } attrs { - name: "beta" + name: "fuse_beta" type: 
FLOAT - } + } attrs { - name: "Scale_x" + name: "fused_output_scale" type: FLOAT } attrs { - name: "Scale_y" + name: "scale_x" type: FLOAT } attrs { - name: "Scale_out" + name: "scale_y" type: FLOAT } attrs { - name: "fuse_activation" - type: STRING + name: "scale_out" + type: FLOAT } } diff --git a/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt index c12b3ca5f3a41b..1f59922cff1556 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt @@ -16,36 +16,31 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" + name: "fuse_activation" type: STRING } attrs { - name: "alpha" + name: "fuse_alpha" type: FLOAT } attrs { - name: "beta" + name: "fuse_beta" type: FLOAT - } + } attrs { - name: "Scale_x" + name: "fused_output_scale" type: FLOAT } attrs { - name: "Scale_y" + name: "scale_x" type: FLOAT } attrs { - name: "Scale_out" + name: "scale_y" type: FLOAT } attrs { - name: "fuse_activation" - type: STRING - } + name: "scale_out" + type: FLOAT } } diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index 6c4bc4d562cd78..b45c72cd2333bd 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -245,21 +245,13 @@ void GetElementwiseAddOpGrad(const std::vector &dout_vec, dout_ptr, dout_vec.data(), size_z * sizeof(T), cudaMemcpyHostToDevice); int axis = -1; - bool use_mkldnn = false, use_quantizer = false; + bool use_mkldnn = false; std::string mkldnn_data_type = "float32"; - std::string x_data_format = "", y_data_format = ""; - float Scale_x = 1.0, Scale_y = 1.0, Scale_out = 1.0; framework::AttributeMap attrs; attrs.insert({"axis", axis}); attrs.insert({"use_mkldnn", use_mkldnn}); - attrs.insert({"x_data_format", x_data_format}); - attrs.insert({"y_data_format", y_data_format}); - attrs.insert({"use_quantizer", use_quantizer}); attrs.insert({"mkldnn_data_type", mkldnn_data_type}); - attrs.insert({"Scale_x", Scale_x}); - attrs.insert({"Scale_y", Scale_y}); - attrs.insert({"Scale_out", Scale_out}); auto op = framework::OpRegistry::CreateOp( "elementwise_add_grad", diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 0f7f6d8b21c238..ca481045c06569 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -112,8 +112,6 @@ const std::unordered_map {"Scale_y", ExtraAttrProperty::ONEDNN}, {"Scale_out", ExtraAttrProperty::ONEDNN}, {"Scale_weights", ExtraAttrProperty::ONEDNN}, - {"x_data_format", ExtraAttrProperty::ONEDNN}, - {"y_data_format", ExtraAttrProperty::ONEDNN}, {"fused_squeeze2_axes", ExtraAttrProperty::ONEDNN}, {"fused_unsqueeze2_axes", ExtraAttrProperty::ONEDNN}, {"fused_reshape2_shape", ExtraAttrProperty::ONEDNN}, diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index 13131bd345bd1a..0ce2ac2c090baf 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -41,8 +41,7 @@ - op : add (elementwise_add) backward : add_grad (elementwise_add_grad) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : addmm 
backward : addmm_grad @@ -452,8 +451,7 @@ outputs : out: Out extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : dot inputs : @@ -505,8 +503,7 @@ - op : elementwise_pow backward : elementwise_pow_grad extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : elu backward : elu_grad, elu_double_grad (elu_grad_grad) @@ -667,20 +664,17 @@ - op : floor_divide (elementwise_floordiv) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : fmax (elementwise_fmax) backward : fmax_grad (elementwise_fmax_grad) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : fmin (elementwise_fmin) backward : fmin_grad (elementwise_fmin_grad) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : fold inputs : @@ -746,8 +740,7 @@ - op : grad_add extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : greater_equal inputs : @@ -824,8 +817,7 @@ - op : heaviside (elementwise_heaviside) backward : heaviside_grad (elementwise_heaviside_grad) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : histogram inputs : @@ -1109,14 +1101,12 @@ - op : maximum (elementwise_max) backward : maximum_grad (elementwise_max_grad) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : maximum (elementwise_min) backward : maximum_grad (elementwise_min_grad) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : maxout inputs : @@ -1188,8 +1178,7 @@ outputs : out 
: Out extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : mv inputs : @@ -1386,8 +1375,7 @@ - op : remainder (elementwise_mod) extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : renorm backward : renorm_grad @@ -1684,8 +1672,7 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", - bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - op : sum (reduce_sum) backward : (sum_grad) reduce_sum_grad From 462f006961dedde51e4d76558720cc11a462c87d Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 9 Mar 2023 14:36:50 +0100 Subject: [PATCH 04/23] adjust quantization attributes --- .../fluid/framework/ir/mkldnn/cpu_quantize_pass.cc | 12 ++++++------ .../ir/mkldnn/cpu_quantize_placement_pass.cc | 3 +++ paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h | 2 +- .../operators/compat/fused_elementwise_mul.pbtxt | 4 ++++ 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index b25b17125363f4..310ba0b81f49e8 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -1045,14 +1045,14 @@ void CPUQuantizePass::QuantizeElementwise( "X", input_x_scale, is_x_unsigned, - "Scale_x"); + "scale_x"); QuantizeInput(g, elementwise_op, elementwise_y, "Y", input_y_scale, is_y_unsigned, - "Scale_y"); + "scale_y"); bool is_output_unsigned{false}; auto output_scale = @@ -1064,7 +1064,7 @@ void CPUQuantizePass::QuantizeElementwise( "Out", output_scale, is_output_unsigned, - "Scale_out"); + "scale_out"); ++quantize_elementwise_count; }; @@ -1310,9 +1310,9 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeImmutable(graph, "nearest_interp", "X"); QuantizeImmutable(graph, "nearest_interp_v2", "X"); QuantizeImmutable(graph, "split", "X"); - QuantizeElementwise(graph, "elementwise_add"); - QuantizeElementwise(graph, "elementwise_mul"); - QuantizeElementwise(graph, "elementwise_sub"); + QuantizeElementwise(graph, "fused_elementwise_add"); + QuantizeElementwise(graph, "fused_elementwise_mul"); + QuantizeElementwise(graph, "fused_elementwise_sub"); QuantizeFusionGru(graph); QuantizeMultiGru(graph); QuantizeFusionLSTM(graph); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 2f6b5e4c14ecc8..c8f40e900e757a 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -32,6 +32,9 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { "fused_conv2d", "fused_conv3d", "fused_matmul", + "fused_elementwise_add", + "fused_elementwise_mul", + "fused_elementwise_sub", "elementwise_add", "elementwise_mul", "elementwise_sub", diff --git 
a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index f946948428a4a2..b008b4cccc39a7 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -164,7 +164,7 @@ inline void ConvertToFusedOp(OpDesc* op) { {"elementwise_add", "fused_elementwise_add"}, {"elementwise_sub", "fused_elementwise_sub"}, {"elementwise_mul", "fused_elementwise_mul"}, - {"elementwise_mul", "fused_elementwise_div"}}; + {"elementwise_div", "fused_elementwise_div"}}; if (op->Type() == "matmul") { op->SetAttr("trans_x", op->GetAttr("transpose_X")); diff --git a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt index 14b099ab9f8fcb..9c46e8b7c7e643 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt @@ -31,6 +31,10 @@ extra { name: "fused_output_scale" type: FLOAT } + attrs { + name: "fused_unsqueeze2_axes" + type: INTS + } attrs { name: "scale_x" type: FLOAT From b72bd52dff574bdf2a0485fcd12298e3b382cba1 Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Thu, 9 Mar 2023 14:57:48 +0100 Subject: [PATCH 05/23] add missing arguments and fix others, review fixed --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- ...cc => elementwise_act_onednn_fuse_pass.cc} | 6 +- ...s.h => elementwise_act_onednn_fuse_pass.h} | 0 .../inference/api/paddle_pass_builder.cc | 2 +- .../operators/compat/elementwise_add.pbtxt | 14 +- .../operators/compat/elementwise_div.pbtxt | 14 +- .../operators/compat/elementwise_mul.pbtxt | 14 +- .../operators/compat/elementwise_sub.pbtxt | 14 +- .../compat/fused_elementwise_add.pbtxt | 28 ++-- .../compat/fused_elementwise_div.pbtxt | 28 ++-- .../compat/fused_elementwise_mul.pbtxt | 28 ++-- .../compat/fused_elementwise_sub.pbtxt | 29 ++-- .../elementwise/elementwise_sub_op.cc | 4 +- .../operators/fused/fused_elementwise_op.h | 27 ++-- .../fusion/onednn/fused_elementwise_kernel.cc | 12 +- .../phi/ops/compat/fused_elementwise_sig.cc | 90 ++++++----- ... 
test_onednn_elementwise_act_fuse_pass.py} | 152 +++++++++--------- ...nn_elementwise_add_activation_fuse_pass.py | 4 +- 18 files changed, 221 insertions(+), 247 deletions(-) rename paddle/fluid/framework/ir/mkldnn/{elt_act_onednn_fuse_pass.cc => elementwise_act_onednn_fuse_pass.cc} (95%) rename paddle/fluid/framework/ir/mkldnn/{elt_act_onednn_fuse_pass.h => elementwise_act_onednn_fuse_pass.h} (100%) rename python/paddle/fluid/tests/unittests/ir/inference/{test_mkldnn_elt_act_fuse_pass.py => test_onednn_elementwise_act_fuse_pass.py} (64%) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 13266b12a3b9ff..bbc7a861264a35 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -175,7 +175,7 @@ if(WITH_MKLDNN) pass_library(softplus_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(shuffle_channel_mkldnn_detect_pass inference DIR mkldnn) pass_library(fc_act_mkldnn_fuse_pass inference DIR mkldnn) - pass_library(elt_act_onednn_fuse_pass inference DIR mkldnn) + pass_library(elementwise_act_onednn_fuse_pass inference DIR mkldnn) pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) pass_library(layer_norm_onednn_optimization_pass inference DIR mkldnn) diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc similarity index 95% rename from paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc rename to paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc index 9999ee3e09a3d0..b6e84145aebffd 100644 --- a/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/mkldnn/activation_onednn_fuse_pass.h" @@ -85,9 +85,9 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( } // namespace framework } // namespace paddle -REGISTER_PASS(elt_act_onednn_fuse_pass, +REGISTER_PASS(elementwise_act_onednn_fuse_pass, paddle::framework::ir::ElementwiseActivationOneDNNPass); -REGISTER_PASS_CAPABILITY(elt_act_onednn_fuse_pass) +REGISTER_PASS_CAPABILITY(elementwise_act_onednn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("elementwise_add", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.h similarity index 100% rename from paddle/fluid/framework/ir/mkldnn/elt_act_onednn_fuse_pass.h rename to paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.h diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 562c3502ce6d32..79c3a4503af516 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -377,7 +377,7 @@ void CpuPassStrategy::EnableMKLDNN() { "batch_norm_act_fuse_pass", // "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // - "elt_act_onednn_fuse_pass", // + "elementwise_act_onednn_fuse_pass", // "layer_norm_onednn_optimization_pass", // "operator_scale_onednn_fuse_pass", // "operator_unsqueeze2_onednn_fuse_pass", // diff --git a/paddle/fluid/operators/compat/elementwise_add.pbtxt b/paddle/fluid/operators/compat/elementwise_add.pbtxt index 5b55f3981c77da..5691bcc8fa9e1c 100644 --- a/paddle/fluid/operators/compat/elementwise_add.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_add.pbtxt @@ -22,25 +22,17 @@ extra { attrs { name: "Out0_threshold" type: FLOAT - } - attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING } attrs { - name: "Scale_x" + name: "scale_x" type: FLOAT } attrs { - name: "Scale_y" + name: "scale_y" type: FLOAT } attrs { - name: "Scale_out" + name: "scale_out" type: FLOAT } } diff --git a/paddle/fluid/operators/compat/elementwise_div.pbtxt b/paddle/fluid/operators/compat/elementwise_div.pbtxt index a73d2072029a3a..48bc2adcc26377 100644 --- a/paddle/fluid/operators/compat/elementwise_div.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_div.pbtxt @@ -16,23 +16,15 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "Scale_x" + name: "scale_x" type: FLOAT } attrs { - name: "Scale_y" + name: "scale_y" type: FLOAT } attrs { - name: "Scale_out" + name: "scale_out" type: FLOAT } attrs { diff --git a/paddle/fluid/operators/compat/elementwise_mul.pbtxt b/paddle/fluid/operators/compat/elementwise_mul.pbtxt index 22289e2689c10a..a24491430b2aa1 100644 --- a/paddle/fluid/operators/compat/elementwise_mul.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_mul.pbtxt @@ -16,23 +16,15 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "Scale_x" + name: "scale_x" type: FLOAT } attrs { - name: "Scale_y" + name: "scale_y" type: FLOAT } attrs { - name: "Scale_out" + name: 
"scale_out" type: FLOAT } } diff --git a/paddle/fluid/operators/compat/elementwise_sub.pbtxt b/paddle/fluid/operators/compat/elementwise_sub.pbtxt index 9f38601f585ea8..a7aca8ca86973a 100644 --- a/paddle/fluid/operators/compat/elementwise_sub.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_sub.pbtxt @@ -16,23 +16,15 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING - } - attrs { - name: "Scale_x" + name: "scale_x" type: FLOAT } attrs { - name: "Scale_y" + name: "scale_y" type: FLOAT } attrs { - name: "Scale_out" + name: "scale_out" type: FLOAT } attrs { diff --git a/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt index 3514e06839bdc1..375970171dd57d 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt @@ -16,35 +16,31 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING + name: "scale_x" + type: FLOAT } attrs { - name: "alpha" + name: "scale_y" type: FLOAT } attrs { - name: "beta" + name: "scale_out" type: FLOAT - } + } attrs { - name: "Scale_x" - type: FLOAT + name: "fuse_activation" + type: STRING } attrs { - name: "Scale_y" + name: "fuse_alpha" type: FLOAT } attrs { - name: "Scale_out" + name: "fuse_beta" type: FLOAT - } + } attrs { - name: "fuse_activation" - type: STRING + name: "fused_output_scale" + type: FLOAT } } diff --git a/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt index 0690a2983e9688..9aefd0c58ca582 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt @@ -16,35 +16,31 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING + name: "scale_x" + type: FLOAT } attrs { - name: "alpha" + name: "scale_y" type: FLOAT } attrs { - name: "beta" + name: "scale_out" type: FLOAT - } + } attrs { - name: "Scale_x" - type: FLOAT + name: "fuse_activation" + type: STRING } attrs { - name: "Scale_y" + name: "fuse_alpha" type: FLOAT } attrs { - name: "Scale_out" + name: "fuse_beta" type: FLOAT - } + } attrs { - name: "fuse_activation" - type: STRING + name: "fused_output_scale" + type: FLOAT } } diff --git a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt index 5a1a27d38fc0fd..44d3cff55bd86e 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt @@ -16,35 +16,35 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING + name: "scale_x" + type: FLOAT } attrs { - name: "y_data_format" - type: STRING + name: "scale_y" + type: FLOAT } attrs { - name: "alpha" + name: "scale_out" type: FLOAT } attrs { - name: "beta" - type: FLOAT - } + name: "fuse_activation" + type: STRING + } attrs { - name: "Scale_x" + name: "fuse_alpha" type: FLOAT } attrs { - name: "Scale_y" + name: "fuse_beta" type: FLOAT - } + } attrs { - name: "Scale_out" + name: "fused_output_scale" type: FLOAT } attrs { - name: "fuse_activation" - type: STRING + name: "fused_unsqueeze2_axes" + type: INTS } } diff --git a/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt index c12b3ca5f3a41b..db540aa9fb1cb7 100644 --- 
a/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt @@ -16,36 +16,31 @@ def { } extra { attrs { - name: "x_data_format" - type: STRING - } - attrs { - name: "y_data_format" - type: STRING + name: "scale_x" + type: FLOAT } attrs { - name: "alpha" + name: "scale_y" type: FLOAT } attrs { - name: "beta" + name: "scale_out" type: FLOAT - } + } attrs { - name: "Scale_x" - type: FLOAT + name: "fuse_activation" + type: STRING } attrs { - name: "Scale_y" + name: "fuse_alpha" type: FLOAT } attrs { - name: "Scale_out" + name: "fuse_beta" type: FLOAT - } + } attrs { - name: "fuse_activation" - type: STRING - } + name: "fused_output_scale" + type: FLOAT } } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index 6088ac3d01945c..73eab8b878755f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -132,9 +132,9 @@ REGISTER_OPERATOR(elementwise_sub_grad_grad, REGISTER_OP_VERSION(elementwise_sub) .AddCheckpoint( - R"ROC(Register elementwise_sub for adding the attribute of Scale_y)ROC", + R"ROC(Register elementwise_sub for adding the attribute of scale_y)ROC", paddle::framework::compatible::OpVersionDesc().NewAttr( - "Scale_y", + "scale_y", "In order to support the function of scaling the input Y when " "using the operator of elementwise_sub.", 1.0f)); diff --git a/paddle/fluid/operators/fused/fused_elementwise_op.h b/paddle/fluid/operators/fused/fused_elementwise_op.h index ca1e368fd06625..45c7d89f2fb3aa 100644 --- a/paddle/fluid/operators/fused/fused_elementwise_op.h +++ b/paddle/fluid/operators/fused/fused_elementwise_op.h @@ -33,21 +33,30 @@ class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { "for broadcasting Y onto X. 
") .SetDefault(-1); - AddAttr("Alfa", "Alfa value for the elementwise operator") + AddAttr( + "fuse_activation", + "Activation type from elementwise_act_onednn_fuse_pass") + .SetDefault(""); + AddAttr("fuse_alpha", "Alfa value for the elementwise operator") .SetDefault(1.0f); - AddAttr("Beta", "Beta value for the elementwise operator") + AddAttr("fuse_beta", "Beta value for the elementwise operator") .SetDefault(1.0f); - AddAttr("fuse_activation", - "Activation type from elt_act_onednn_fuse_pass") - .SetDefault(""); - - AddAttr("Scale_x", "Obtained from cpu_quantize_pass") + AddAttr("scale_x", "Obtained from cpu_quantize_pass") .SetDefault(1.0f); - AddAttr("Scale_y", "Obtained from cpu_quantize_pass") + AddAttr("scale_y", "Obtained from cpu_quantize_pass") .SetDefault(1.0f); - AddAttr("Scale_out", "Obtained from cpu_quantize_pass") + AddAttr("scale_out", "Obtained from cpu_quantize_pass") + .SetDefault(1.0f); + + AddAttr("fused_output_scale", + "Obtained from operator_scale_onednn_fuse_pass") .SetDefault(1.0f); + AddAttr>( + "fused_unsqueeze2_axes", + "Obtained from operator_unsqueeze2_onednn_fuse_pass for " + "elementwise_mul") + .SetDefault({}); AddOpComment(); } diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc index b10b3902b0bdba..1f1a6b7be76fc8 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc @@ -25,15 +25,15 @@ void FusedElementwiseKernel(const OneDNNContext& dev_ctx, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); - float scale_x = dev_ctx.HasDnnAttr("Scale_x") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")) + float scale_x = dev_ctx.HasDnnAttr("scale_x") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("scale_x")) : 1.0f; - float scale_y = dev_ctx.HasDnnAttr("Scale_y") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")) + float scale_y = dev_ctx.HasDnnAttr("scale_y") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("scale_y")) : 1.0f; float scale_out = - dev_ctx.HasDnnAttr("Scale_out") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")) + dev_ctx.HasDnnAttr("scale_out") + ? 
PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("scale_out")) : 1.0f; dnnl::post_ops post_operations; diff --git a/paddle/phi/ops/compat/fused_elementwise_sig.cc b/paddle/phi/ops/compat/fused_elementwise_sig.cc index 4b1b9599ab32d7..f7ed8474ef44b3 100644 --- a/paddle/phi/ops/compat/fused_elementwise_sig.cc +++ b/paddle/phi/ops/compat/fused_elementwise_sig.cc @@ -22,25 +22,27 @@ KernelSignature FusedElementwiseAddOpArgumentMapping( if (axis == -1) { return KernelSignature("fused_add", {"X", "Y"}, - {"alpha", - "beta", + {"fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale"}, {"Out"}); } return KernelSignature("fused_add_raw", {"X", "Y"}, {"axis", - "alpha", - "beta", + "fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale"}, {"Out"}); } @@ -50,25 +52,27 @@ KernelSignature FusedElementwiseSubOpArgumentMapping( if (axis == -1) { return KernelSignature("fused_subtract", {"X", "Y"}, - {"alpha", - "beta", + {"fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale"}, {"Out"}); } return KernelSignature("fused_subtract_raw", {"X", "Y"}, {"axis", - "alpha", - "beta", + "fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale"}, {"Out"}); } @@ -78,25 +82,29 @@ KernelSignature FusedElementwiseMulOpArgumentMapping( if (axis == -1) { return KernelSignature("fused_multiply", {"X", "Y"}, - {"alpha", - "beta", + {"fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale", + "fused_unsqueeze2_axes"}, {"Out"}); } return KernelSignature("fused_multiply_raw", {"X", "Y"}, {"axis", - "alpha", - "beta", + "fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale", + "fused_unsqueeze2_axes"}, {"Out"}); } @@ -106,25 +114,27 @@ KernelSignature FusedElementwiseDivOpArgumentMapping( if (axis == -1) { return KernelSignature("fused_divide", {"X", "Y"}, - {"alpha", - "beta", + {"fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale"}, {"Out"}); } return KernelSignature("fused_divide_raw", {"X", "Y"}, {"axis", - "alpha", - "beta", + "fuse_fuse_alpha", + "fuse_beta", "fuse_activation", "mkldnn_data_type", - "Scale_x", - "Scale_y", - "Scale_out"}, + "scale_x", + "scale_y", + "scale_out", + "fused_output_scale"}, {"Out"}); } diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_act_fuse_pass.py similarity index 64% rename from python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_act_fuse_pass.py index 3315534821dc02..0572a87b8a6531 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_elt_act_fuse_pass.py +++ 
b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_act_fuse_pass.py @@ -23,10 +23,10 @@ from paddle.fluid.core import PassVersionChecker -class ElementwiseActivationMkldnnFusePassTest(InferencePassTest): +class ElementwiseActivationOneDNNFusePassTest(InferencePassTest): act_alpha = None act_beta = None - pass_name = 'elt_act_onednn_fuse_pass' + pass_name = 'elementwise_act_onednn_fuse_pass' def setUp(self): self.set_params() @@ -65,24 +65,24 @@ def test_pass_compatible(self): self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) -class ElementwiseActivationMkldnnFusePassTest_Add_Relu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Relu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = F.relu -class ElementwiseActivationMkldnnFusePassTest_Add_Tanh( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Tanh( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.tanh -class ElementwiseActivationMkldnnFusePassTest_Add_LeakyRelu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_LeakyRelu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add @@ -90,40 +90,40 @@ def set_params(self): self.act = paddle.nn.functional.leaky_relu -class ElementwiseActivationMkldnnFusePassTest_Add_Swish( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Swish( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.nn.functional.swish -class ElementwiseActivationMkldnnFusePassTest_Add_HardSwish( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_HardSwish( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.nn.functional.hardswish -class ElementwiseActivationMkldnnFusePassTest_Add_SQRT( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_SQRT( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.sqrt -class ElementwiseActivationMkldnnFusePassTest_Add_ABS( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_ABS( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.abs -class ElementwiseActivationMkldnnFusePassTest_Add_Clip( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Clip( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add @@ -132,16 +132,16 @@ def set_params(self): self.act_beta = 10.0 -class ElementwiseActivationMkldnnFusePassTest_Add_Gelu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Gelu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.nn.functional.gelu -class ElementwiseActivationMkldnnFusePassTest_Add_Gelu_Tanh( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Gelu_Tanh( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add @@ -149,40 +149,40 @@ def set_params(self): self.act_alpha = True -class 
ElementwiseActivationMkldnnFusePassTest_Add_Relu6( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Relu6( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.nn.functional.relu6 -class ElementwiseActivationMkldnnFusePassTest_Add_Sigmoid( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Add_Sigmoid( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add self.act = paddle.nn.functional.sigmoid -class ElementwiseActivationMkldnnFusePassTest_Sub_Relu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Relu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract self.act = F.relu -class ElementwiseActivationMkldnnFusePassTest_Sub_Tanh( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Tanh( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract self.act = paddle.tanh -class ElementwiseActivationMkldnnFusePassTest_Sub_LeakyRelu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_LeakyRelu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract @@ -190,32 +190,32 @@ def set_params(self): self.act = paddle.nn.functional.leaky_relu -class ElementwiseActivationMkldnnFusePassTest_Sub_Swish( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Swish( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract self.act = paddle.nn.functional.swish -class ElementwiseActivationMkldnnFusePassTest_Sub_HardSwish( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_HardSwish( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract self.act = paddle.nn.functional.hardswish -class ElementwiseActivationMkldnnFusePassTest_Sub_ABS( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_ABS( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract self.act = paddle.abs -class ElementwiseActivationMkldnnFusePassTest_Sub_Clip( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Clip( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract @@ -224,16 +224,16 @@ def set_params(self): self.act_beta = 10.0 -class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Gelu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract self.act = paddle.nn.functional.gelu -class ElementwiseActivationMkldnnFusePassTest_Sub_Gelu_Tanh( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Gelu_Tanh( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract @@ -241,40 +241,40 @@ def set_params(self): self.act_alpha = True -class ElementwiseActivationMkldnnFusePassTest_Sub_Relu6( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Relu6( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract 
self.act = paddle.nn.functional.relu6 -class ElementwiseActivationMkldnnFusePassTest_Sub_Sigmoid( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Sub_Sigmoid( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract self.act = paddle.nn.functional.sigmoid -class ElementwiseActivationMkldnnFusePassTest_Mul_Relu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Relu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply self.act = F.relu -class ElementwiseActivationMkldnnFusePassTest_Mul_Tanh( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Tanh( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply self.act = paddle.tanh -class ElementwiseActivationMkldnnFusePassTest_Mul_LeakyRelu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_LeakyRelu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply @@ -282,40 +282,40 @@ def set_params(self): self.act = paddle.nn.functional.leaky_relu -class ElementwiseActivationMkldnnFusePassTest_Mul_Swish( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Swish( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply self.act = paddle.nn.functional.swish -class ElementwiseActivationMkldnnFusePassTest_Mul_HardSwish( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_HardSwish( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply self.act = paddle.nn.functional.hardswish -class ElementwiseActivationMkldnnFusePassTest_Mul_SQRT( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_SQRT( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply self.act = paddle.sqrt -class ElementwiseActivationMkldnnFusePassTest_Mul_ABS( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_ABS( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply self.act = paddle.abs -class ElementwiseActivationMkldnnFusePassTest_Mul_Clip( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Clip( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply @@ -324,16 +324,16 @@ def set_params(self): self.act_beta = 10.0 -class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Gelu( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply self.act = paddle.nn.functional.gelu -class ElementwiseActivationMkldnnFusePassTest_Mul_Gelu_Tanh( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Gelu_Tanh( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply @@ -341,16 +341,16 @@ def set_params(self): self.act_alpha = True -class ElementwiseActivationMkldnnFusePassTest_Mul_Relu6( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Relu6( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): 
self.operand = paddle.multiply self.act = paddle.nn.functional.relu6 -class ElementwiseActivationMkldnnFusePassTest_Mul_Sigmoid( - ElementwiseActivationMkldnnFusePassTest +class ElementwiseActivationOneDNNFusePassTest_Mul_Sigmoid( + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply @@ -358,7 +358,7 @@ def set_params(self): class ElementwiseScaleOneDNNFusePassTest_Add( - ElementwiseActivationMkldnnFusePassTest + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.add @@ -367,7 +367,7 @@ def set_params(self): class ElementwiseScaleOneDNNFusePassTest_Sub( - ElementwiseActivationMkldnnFusePassTest + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.subtract @@ -376,7 +376,7 @@ def set_params(self): class ElementwiseScaleOneDNNFusePassTest_Mul( - ElementwiseActivationMkldnnFusePassTest + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.multiply @@ -385,7 +385,7 @@ def set_params(self): class ElementwiseScaleOneDNNFusePassTest_Div( - ElementwiseActivationMkldnnFusePassTest + ElementwiseActivationOneDNNFusePassTest ): def set_params(self): self.operand = paddle.divide diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py index 92341facaaf217..e4a4809971739a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_elementwise_add_activation_fuse_pass.py @@ -118,7 +118,7 @@ def sample_predictor_configs(self, program_config): config = self.create_inference_config( use_mkldnn=True, passes=[ - 'elt_act_onednn_fuse_pass', + 'elementwise_act_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', ], ) @@ -128,7 +128,7 @@ def test(self): self.run_and_statis( quant=False, passes=[ - 'elt_act_onednn_fuse_pass', + 'elementwise_act_onednn_fuse_pass', 'operator_scale_onednn_fuse_pass', ], ) From 190e7976bf5f32f2600eccec8c37e5fd39263634 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 9 Mar 2023 16:58:58 +0100 Subject: [PATCH 06/23] simplify fused kernel registration --- .../elementwise_act_onednn_fuse_pass.cc | 54 +++---- .../operators/fused/fused_elementwise_op.h | 18 ++- .../fusion/onednn/fused_elementwise_kernel.cc | 136 ++++++++---------- .../phi/ops/compat/fused_elementwise_sig.cc | 105 +++----------- 4 files changed, 108 insertions(+), 205 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc index 8df33719676f18..b6e84145aebffd 100644 --- a/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/elementwise_act_onednn_fuse_pass.cc @@ -85,36 +85,24 @@ void ElementwiseActivationOneDNNPass::FuseElementwiseAct( } // namespace framework } // namespace paddle -< < < < < < < < - HEAD : paddle / fluid / framework / ir / mkldnn / - elementwise_activation_onednn_fuse_pass.cc REGISTER_PASS( - elementwise_activation_onednn_fuse_pass, - paddle::framework::ir::ElementwiseActivationOneDNNPass); -REGISTER_PASS_CAPABILITY(elementwise_activation_onednn_fuse_pass) -== == == == - REGISTER_PASS(elementwise_act_onednn_fuse_pass, - paddle::framework::ir::ElementwiseActivationOneDNNPass); 
-REGISTER_PASS_CAPABILITY(elementwise_act_onednn_fuse_pass) >>>>>>>> - jhulek / - Fused_Elementwise_Kernel_And_Op - : paddle - / - fluid / framework / ir / mkldnn / - elementwise_act_onednn_fuse_pass.cc.AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("elementwise_add", 1) - .LE("elementwise_sub", 1) - .LE("elementwise_mul", 1) - .EQ("abs", 0) - .LE("clip", 1) - .EQ("gelu", 0) - .EQ("hard_sigmoid", 0) - .LE("hard_swish", 0) - .LE("leaky_relu", 1) - .LE("mish", 1) - .EQ("relu", 0) - .EQ("relu6", 0) - .EQ("sigmoid", 0) - .EQ("sqrt", 0) - .EQ("swish", 0) - .EQ("tanh", 0)); +REGISTER_PASS(elementwise_act_onednn_fuse_pass, + paddle::framework::ir::ElementwiseActivationOneDNNPass); +REGISTER_PASS_CAPABILITY(elementwise_act_onednn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .LE("elementwise_sub", 1) + .LE("elementwise_mul", 1) + .EQ("abs", 0) + .LE("clip", 1) + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) + .LE("hard_swish", 0) + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 0)); diff --git a/paddle/fluid/operators/fused/fused_elementwise_op.h b/paddle/fluid/operators/fused/fused_elementwise_op.h index 0a9010bd14fbd1..21bd80d1b7d1f4 100644 --- a/paddle/fluid/operators/fused/fused_elementwise_op.h +++ b/paddle/fluid/operators/fused/fused_elementwise_op.h @@ -37,17 +37,9 @@ class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { "Activation type from elementwise_act_onednn_fuse_pass") .SetDefault(""); AddAttr("fuse_alpha", "Alfa value for the elementwise operator") - .SetDefault(1.0f); + .SetDefault(0.0f); AddAttr("fuse_beta", "Beta value for the elementwise operator") - .SetDefault(1.0f); - - AddAttr("scale_x", "Obtained from cpu_quantize_pass") - .SetDefault(1.0f); - AddAttr("scale_y", "Obtained from cpu_quantize_pass") - .SetDefault(1.0f); - AddAttr("scale_out", "Obtained from cpu_quantize_pass") - .SetDefault(1.0f); - + .SetDefault(0.0f); AddAttr("fused_output_scale", "Obtained from operator_scale_onednn_fuse_pass") .SetDefault(1.0f); @@ -56,6 +48,12 @@ class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { "Obtained from operator_unsqueeze2_onednn_fuse_pass for " "elementwise_mul") .SetDefault({}); + AddAttr("scale_x", "Obtained from cpu_quantize_pass") + .SetDefault(1.0f); + AddAttr("scale_y", "Obtained from cpu_quantize_pass") + .SetDefault(1.0f); + AddAttr("scale_out", "Obtained from cpu_quantize_pass") + .SetDefault(1.0f); AddOpComment(); } diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc index 1f1a6b7be76fc8..185b9797e7ec7f 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -21,28 +21,24 @@ template void FusedElementwiseKernel(const OneDNNContext& dev_ctx, const DenseTensor& x, const DenseTensor& y, - int axis, + const int axis, + const std::string& fuse_activation, + const float fuse_alpha, + const float fuse_beta, + const float fused_output_scale, + const std::vector& fused_unsqueeze2_axes, + const float scale_x, + const float scale_y, + const float scale_out, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); - float scale_x = dev_ctx.HasDnnAttr("scale_x") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("scale_x")) - : 1.0f; - float scale_y = dev_ctx.HasDnnAttr("scale_y") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("scale_y")) - : 1.0f; - float scale_out = - dev_ctx.HasDnnAttr("scale_out") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("scale_out")) - : 1.0f; - dnnl::post_ops post_operations; - funcs::AppendActivation(dev_ctx, post_operations); - if (dev_ctx.HasDnnAttr("fused_output_scale")) { - float scale_alpha = - PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fused_output_scale")); + funcs::AppendActivation( + dev_ctx, post_operations, 1.0f, fuse_activation, fuse_alpha, fuse_beta); + if (fused_output_scale != 1.0) { post_operations.append_eltwise( - 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); + 1.0, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f); } auto* non_const_x = &x; @@ -108,33 +104,51 @@ void FusedElementwiseKernel(const OneDNNContext& dev_ctx, binary_prim->execute(astream, args); astream.wait(); - if (handler.use_broadcasting_hack == false) { - funcs::SetOutMemDescWithLogicalLayoutFusesSupport( - dev_ctx, out, dst_memory->get_desc()); - } else { - auto dims = dst_memory->get_desc().dims(); + auto out_md = dst_memory->get_desc(); + + if (handler.use_broadcasting_hack) { + auto dims = out_md.dims(); dims.insert(dims.begin(), non_const_x->dims()[0]); dims[1] /= dims[0]; - funcs::SetOutMemDescWithLogicalLayoutFusesSupport( - dev_ctx, out, dst_memory->get_desc().reshape(dims)); + out_md = out_md.reshape(dims); + } + + if (fused_unsqueeze2_axes.empty()) { + out->set_mem_desc(out_md); + } else { + funcs::SetOutMemDescWithUnsqueeze2FuseSupport( + fused_unsqueeze2_axes, out, out_md); } } -#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \ - template \ - void name##RawKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - int axis, \ - DenseTensor* out) { \ - FusedElementwiseKernel(dev_ctx, x, y, axis, out); \ - } \ - template \ - void name##Kernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& y, \ - DenseTensor* out) { \ - FusedElementwiseKernel(dev_ctx, x, y, -1, out); \ +#define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \ + template \ + void name##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + const int axis, \ + const std::string& fuse_activation, \ + const float fuse_alpha, \ + const float fuse_beta, \ + const float fused_output_scale, \ + const std::vector& fused_unsqueeze2_axes, \ + const float scale_x, \ + const float scale_y, \ + const float scale_out, \ + DenseTensor* out) { \ + FusedElementwiseKernel(dev_ctx, \ + x, \ + y, \ + axis, \ + fuse_activation, \ + fuse_alpha, \ + fuse_beta, \ + fused_output_scale, \ + fused_unsqueeze2_axes, \ + scale_x, \ + scale_y, \ + scale_out, \ + out); \ } DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedAdd, dnnl::algorithm::binary_add) @@ -144,16 +158,7 @@ DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedDivide, dnnl::algorithm::binary_div) } // namespace phi 
-PD_REGISTER_KERNEL(fused_add_raw, - OneDNN, - ONEDNN, - phi::FusedAddRawKernel, - float, - phi::dtype::bfloat16, - int8_t, - uint8_t) {} - -PD_REGISTER_KERNEL(fused_add, +PD_REGISTER_KERNEL(fused_elementwise_add, OneDNN, ONEDNN, phi::FusedAddKernel, @@ -162,16 +167,7 @@ PD_REGISTER_KERNEL(fused_add, int8_t, uint8_t) {} -PD_REGISTER_KERNEL(fused_subtract_raw, - OneDNN, - ONEDNN, - phi::FusedSubtractRawKernel, - float, - phi::dtype::bfloat16, - int8_t, - uint8_t) {} - -PD_REGISTER_KERNEL(fused_subtract, +PD_REGISTER_KERNEL(fused_elementwise_sub, OneDNN, ONEDNN, phi::FusedSubtractKernel, @@ -180,16 +176,7 @@ PD_REGISTER_KERNEL(fused_subtract, int8_t, uint8_t) {} -PD_REGISTER_KERNEL(fused_multiply_raw, - OneDNN, - ONEDNN, - phi::FusedMultiplyRawKernel, - float, - phi::dtype::bfloat16, - int8_t, - uint8_t) {} - -PD_REGISTER_KERNEL(fused_multiply, +PD_REGISTER_KERNEL(fused_elementwise_mul, OneDNN, ONEDNN, phi::FusedMultiplyKernel, @@ -198,14 +185,7 @@ PD_REGISTER_KERNEL(fused_multiply, int8_t, uint8_t) {} -PD_REGISTER_KERNEL(fused_divide_raw, - OneDNN, - ONEDNN, - phi::FusedDivideRawKernel, - float, - phi::dtype::bfloat16) {} - -PD_REGISTER_KERNEL(fused_divide, +PD_REGISTER_KERNEL(fused_elementwise_div, OneDNN, ONEDNN, phi::FusedDivideKernel, diff --git a/paddle/phi/ops/compat/fused_elementwise_sig.cc b/paddle/phi/ops/compat/fused_elementwise_sig.cc index f7ed8474ef44b3..5494546de9804f 100644 --- a/paddle/phi/ops/compat/fused_elementwise_sig.cc +++ b/paddle/phi/ops/compat/fused_elementwise_sig.cc @@ -18,133 +18,70 @@ namespace phi { KernelSignature FusedElementwiseAddOpArgumentMapping( const ArgumentMappingContext& ctx) { - int axis = paddle::any_cast(ctx.Attr("axis")); - if (axis == -1) { - return KernelSignature("fused_add", - {"X", "Y"}, - {"fuse_alpha", - "fuse_beta", - "fuse_activation", - "mkldnn_data_type", - "scale_x", - "scale_y", - "scale_out", - "fused_output_scale"}, - {"Out"}); - } - return KernelSignature("fused_add_raw", + return KernelSignature("fused_elementwise_add", {"X", "Y"}, {"axis", + "fuse_activation", "fuse_alpha", "fuse_beta", - "fuse_activation", - "mkldnn_data_type", + "fused_output_scale", + "fused_unsqueeze2_axes", "scale_x", "scale_y", - "scale_out", - "fused_output_scale"}, + "scale_out"}, {"Out"}); } KernelSignature FusedElementwiseSubOpArgumentMapping( const ArgumentMappingContext& ctx) { - int axis = paddle::any_cast(ctx.Attr("axis")); - if (axis == -1) { - return KernelSignature("fused_subtract", - {"X", "Y"}, - {"fuse_alpha", - "fuse_beta", - "fuse_activation", - "mkldnn_data_type", - "scale_x", - "scale_y", - "scale_out", - "fused_output_scale"}, - {"Out"}); - } - return KernelSignature("fused_subtract_raw", + return KernelSignature("fused_elementwise_sub", {"X", "Y"}, {"axis", + "fuse_activation", "fuse_alpha", "fuse_beta", - "fuse_activation", - "mkldnn_data_type", + "fused_output_scale", + "fused_unsqueeze2_axes", "scale_x", "scale_y", - "scale_out", - "fused_output_scale"}, + "scale_out"}, {"Out"}); } KernelSignature FusedElementwiseMulOpArgumentMapping( const ArgumentMappingContext& ctx) { - int axis = paddle::any_cast(ctx.Attr("axis")); - if (axis == -1) { - return KernelSignature("fused_multiply", - {"X", "Y"}, - {"fuse_alpha", - "fuse_beta", - "fuse_activation", - "mkldnn_data_type", - "scale_x", - "scale_y", - "scale_out", - "fused_output_scale", - "fused_unsqueeze2_axes"}, - {"Out"}); - } - return KernelSignature("fused_multiply_raw", + return KernelSignature("fused_elementwise_mul", {"X", "Y"}, {"axis", + "fuse_activation", "fuse_alpha", 
"fuse_beta", - "fuse_activation", - "mkldnn_data_type", + "fused_output_scale", + "fused_unsqueeze2_axes", "scale_x", "scale_y", - "scale_out", - "fused_output_scale", - "fused_unsqueeze2_axes"}, + "scale_out"}, {"Out"}); } KernelSignature FusedElementwiseDivOpArgumentMapping( const ArgumentMappingContext& ctx) { - int axis = paddle::any_cast(ctx.Attr("axis")); - if (axis == -1) { - return KernelSignature("fused_divide", - {"X", "Y"}, - {"fuse_alpha", - "fuse_beta", - "fuse_activation", - "mkldnn_data_type", - "scale_x", - "scale_y", - "scale_out", - "fused_output_scale"}, - {"Out"}); - } - return KernelSignature("fused_divide_raw", + return KernelSignature("fused_elementwise_div", {"X", "Y"}, {"axis", - "fuse_fuse_alpha", - "fuse_beta", "fuse_activation", - "mkldnn_data_type", + "fuse_alpha", + "fuse_beta", + "fused_output_scale", + "fused_unsqueeze2_axes", "scale_x", "scale_y", - "scale_out", - "fused_output_scale"}, + "scale_out"}, {"Out"}); } } // namespace phi -PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_add, fused_add); -PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_sub, fused_subtract); -PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_mul, fused_multiply); -PD_REGISTER_BASE_KERNEL_NAME(fused_elementwise_div, fused_divide); - PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_add, phi::FusedElementwiseAddOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(fused_elementwise_sub, From 4f729433733e7ab38c85b51dffdfee1c879da21d Mon Sep 17 00:00:00 2001 From: Silv3S Date: Thu, 9 Mar 2023 17:10:13 +0100 Subject: [PATCH 07/23] fix elementwise unit tests --- .../ir/mkldnn/operator_scale_onednn_fuse_pass.cc | 13 +++++++++---- .../mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc | 5 +++-- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc index 6c060e125372fd..086de37ab42996 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc @@ -31,6 +31,10 @@ void FuseOperatorScaleOneDNNPass::ApplyImpl(Graph *graph) const { "fused_matmul", "matmul", "matmul_v2", + "fused_elementwise_add", + "fused_elementwise_sub", + "fused_elementwise_mul", + "fused_elementwise_div", "elementwise_add", "elementwise_sub", "elementwise_mul", @@ -87,10 +91,7 @@ void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph, scale = *(scale_tensor->data()); } - if (op_type == "matmul" || op_type == "matmul_v2") { - ConvertToFusedOp(operator_op->Op()); - } - + ConvertToFusedOp(operator_op->Op()); operator_op->Op()->SetAttr("fused_output_scale", scale); operator_op->Op()->SetOutput("Out", {scale_out->Name()}); @@ -120,6 +121,10 @@ REGISTER_PASS_CAPABILITY(operator_scale_onednn_fuse_pass) .EQ("fused_matmul", 0) .LE("matmul", 1) .EQ("matmul_v2", 0) + .EQ("fused_elementwise_add", 0) + .EQ("fused_elementwise_sub", 0) + .EQ("fused_elementwise_mul", 0) + .EQ("fused_elementwise_div", 0) .LE("elementwise_add", 1) .LE("elementwise_sub", 1) .LE("elementwise_mul", 1) diff --git a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc index 716d14886327a0..0d908bd358ece6 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc @@ -14,8 +14,8 @@ #include 
"paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/backends/onednn/onednn_reuse.h" #include "paddle/utils/string/pretty_log.h" namespace paddle { @@ -26,7 +26,7 @@ using string::PrettyLogDetail; void FuseOperatorUnsqueeze2OneDNNPass::ApplyImpl(Graph *graph) const { std::vector> ops_and_outputs = { - {"transpose2", 2}, {"elementwise_mul", 1}}; + {"transpose2", 2}, {"fused_elementwise_mul", 1}, {"elementwise_mul", 1}}; for (const auto &op_and_outputs : ops_and_outputs) FuseUnsqueeze2(graph, op_and_outputs.first, op_and_outputs.second); @@ -55,6 +55,7 @@ void FuseOperatorUnsqueeze2OneDNNPass::FuseUnsqueeze2( GET_IR_NODE_FROM_SUBGRAPH( unsqueeze2_out, unsqueeze2_out, op_unsqueeze2_pattern); + ConvertToFusedOp(operator_op->Op()); if (!operator_op->Op()->HasAttr("use_mkldnn") || (operator_op->Op()->HasAttr("use_mkldnn") && !(PADDLE_GET_CONST(bool, operator_op->Op()->GetAttr("use_mkldnn"))))) { From 1ad3f16a23d5b3e9dbc6c3db2de9aa94885ef50a Mon Sep 17 00:00:00 2001 From: Silv3S Date: Fri, 10 Mar 2023 13:29:22 +0100 Subject: [PATCH 08/23] reuse one fused elementwise op --- .../fused/fused_elementwise_add_op.cc | 56 -------- .../fused/fused_elementwise_div_op.cc | 56 -------- .../fused/fused_elementwise_mul_op.cc | 56 -------- .../operators/fused/fused_elementwise_op.cc | 95 +++++++++++++ .../operators/fused/fused_elementwise_op.h | 125 ------------------ .../fused/fused_elementwise_sub_op.cc | 56 -------- 6 files changed, 95 insertions(+), 349 deletions(-) delete mode 100644 paddle/fluid/operators/fused/fused_elementwise_add_op.cc delete mode 100644 paddle/fluid/operators/fused/fused_elementwise_div_op.cc delete mode 100644 paddle/fluid/operators/fused/fused_elementwise_mul_op.cc create mode 100644 paddle/fluid/operators/fused/fused_elementwise_op.cc delete mode 100644 paddle/fluid/operators/fused/fused_elementwise_op.h delete mode 100644 paddle/fluid/operators/fused/fused_elementwise_sub_op.cc diff --git a/paddle/fluid/operators/fused/fused_elementwise_add_op.cc b/paddle/fluid/operators/fused/fused_elementwise_add_op.cc deleted file mode 100644 index e8b689a9eb1d37..00000000000000 --- a/paddle/fluid/operators/fused/fused_elementwise_add_op.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/fused/fused_elementwise_op.h" - -namespace paddle { -namespace operators { - -class FusedElementwiseAddOpMaker : public FusedElementwiseOpMaker { - protected: - std::string GetName() const override { return "Add"; } - std::string GetEquation() const override { return "Out = X + Y"; } - - void AddInputX() override { - AddInput( - "X", - "(Variable), Tensor or phi::DenseTensor of any dimensions. 
Its dtype " - "should be int32, int64, float32, float64."); - } - - void AddInputY() override { - AddInput( - "Y", - "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " - "should be int32, int64, float32, float64."); - } - - std::string GetOpFuntionality() const override { - return "Add two tensors element-wise"; - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - fused_elementwise_add, - ops::ElementwiseOp, - ops::FusedElementwiseAddOpMaker, - ops::ElementwiseOpInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_elementwise_div_op.cc b/paddle/fluid/operators/fused/fused_elementwise_div_op.cc deleted file mode 100644 index 3f36ab85e73718..00000000000000 --- a/paddle/fluid/operators/fused/fused_elementwise_div_op.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/fused/fused_elementwise_op.h" - -namespace paddle { -namespace operators { - -class FusedElementwiseDivOpMaker : public FusedElementwiseOpMaker { - protected: - std::string GetName() const override { return "Div"; } - std::string GetEquation() const override { return "Out = X / Y"; } - - void AddInputX() override { - AddInput( - "X", - "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " - "should be int32, int64, float32, float64."); - } - - void AddInputY() override { - AddInput( - "Y", - "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " - "should be int32, int64, float32, float64."); - } - - std::string GetOpFuntionality() const override { - return "Divide two tensors element-wise"; - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - fused_elementwise_div, - ops::ElementwiseOp, - ops::FusedElementwiseDivOpMaker, - ops::ElementwiseOpInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_elementwise_mul_op.cc b/paddle/fluid/operators/fused/fused_elementwise_mul_op.cc deleted file mode 100644 index 92a73e1df56d14..00000000000000 --- a/paddle/fluid/operators/fused/fused_elementwise_mul_op.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/fused/fused_elementwise_op.h" - -namespace paddle { -namespace operators { - -class FusedElementwiseMulOpMaker : public FusedElementwiseOpMaker { - protected: - std::string GetName() const override { return "Mul"; } - std::string GetEquation() const override { return "Out = X \\\\odot Y"; } - - void AddInputX() override { - AddInput( - "X", - "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " - "should be int32, int64, float32, float64."); - } - - void AddInputY() override { - AddInput( - "Y", - "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " - "should be int32, int64, float32, float64."); - } - - std::string GetOpFuntionality() const override { - return "Multiply two tensors element-wise"; - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - fused_elementwise_mul, - ops::ElementwiseOp, - ops::FusedElementwiseMulOpMaker, - ops::ElementwiseOpInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_elementwise_op.cc b/paddle/fluid/operators/fused/fused_elementwise_op.cc new file mode 100644 index 00000000000000..e6c2743e9385d2 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_elementwise_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" + +namespace paddle { +namespace operators { + +class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() final { + AddInput("X", "The first input tensor of elementwise op."); + AddInput("Y", "The second input tensor of elementwise op."); + AddOutput("Out", "A location into which the result is stored."); + AddAttr( + "axis", + "If X.dimension != Y.dimension, Y.dimension must be a " + "subsequence of X.dimension. 
And axis is the start dimension index " + "for broadcasting Y onto X.") + .SetDefault(-1); + AddAttr( + "fuse_activation", + "Activation type from elementwise_act_onednn_fuse_pass") + .SetDefault(""); + AddAttr("fuse_alpha", + "Activation alpha from elementwise_act_onednn_fuse_pass") + .SetDefault(0.0f); + AddAttr("fuse_beta", + "Activation beta from elementwise_act_onednn_fuse_pass") + .SetDefault(0.0f); + AddAttr("fused_output_scale", + "Obtained from operator_scale_onednn_fuse_pass") + .SetDefault(1.0f); + AddAttr>( + "fused_unsqueeze2_axes", + "Obtained from operator_unsqueeze2_onednn_fuse_pass") + .SetDefault({}); + AddAttr("scale_x", "Elementwise X input quantization scale") + .SetDefault(1.0f); + AddAttr("scale_y", "Elementwise Y input quantization scale") + .SetDefault(1.0f); + AddAttr("scale_out", "Elementwise Out output quantization scale") + .SetDefault(1.0f); + AddComment( + R"DOC(Elementwise operator extended with oneDNN-specific fusion logic.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_elementwise_add, + ops::ElementwiseOp, + ops::FusedElementwiseOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OPERATOR( + fused_elementwise_sub, + ops::ElementwiseOp, + ops::FusedElementwiseOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OPERATOR( + fused_elementwise_mul, + ops::ElementwiseOp, + ops::FusedElementwiseOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OPERATOR( + fused_elementwise_div, + ops::ElementwiseOp, + ops::FusedElementwiseOpMaker, + ops::ElementwiseOpInferVarType, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_elementwise_op.h b/paddle/fluid/operators/fused/fused_elementwise_op.h deleted file mode 100644 index 21bd80d1b7d1f4..00000000000000 --- a/paddle/fluid/operators/fused/fused_elementwise_op.h +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" - -namespace paddle { -namespace operators { - -class FusedElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() final { - AddInputX(); - AddInputY(); - AddOpOutput(); - AddAttr("axis", - "(int, default -1). If X.dimension != Y.dimension," - "Y.dimension must be a subsequence of x.dimension. And axis " - "is the start dimension index " - "for broadcasting Y onto X. 
") - .SetDefault(-1); - AddAttr( - "fuse_activation", - "Activation type from elementwise_act_onednn_fuse_pass") - .SetDefault(""); - AddAttr("fuse_alpha", "Alfa value for the elementwise operator") - .SetDefault(0.0f); - AddAttr("fuse_beta", "Beta value for the elementwise operator") - .SetDefault(0.0f); - AddAttr("fused_output_scale", - "Obtained from operator_scale_onednn_fuse_pass") - .SetDefault(1.0f); - AddAttr>( - "fused_unsqueeze2_axes", - "Obtained from operator_unsqueeze2_onednn_fuse_pass for " - "elementwise_mul") - .SetDefault({}); - AddAttr("scale_x", "Obtained from cpu_quantize_pass") - .SetDefault(1.0f); - AddAttr("scale_y", "Obtained from cpu_quantize_pass") - .SetDefault(1.0f); - AddAttr("scale_out", "Obtained from cpu_quantize_pass") - .SetDefault(1.0f); - AddOpComment(); - } - - protected: - virtual void AddInputX() { - AddInput("X", "(Tensor), The first input tensor of elementwise op."); - } - virtual void AddInputY() { - AddInput("Y", "(Tensor), The second input tensor of elementwise op."); - } - virtual void AddOpOutput() { - AddOutput("Out", - "N-dimension tensor. A location into which the result is stored. " - "It's dimension " - "equals with x"); - } - virtual void AddOpComment() { AddComment(GetCommentExamples()); } - - virtual std::string GetOpFuntionality() const { return ""; } - - virtual std::string GetName() const = 0; - virtual std::string GetEquation() const = 0; - - std::string GetCommentExamples() const { - return string::Sprintf(R"DOC( -Elementwise %s Operator. - -%s - -The equation is: - -$$%s$$ - -- $X$: a tensor of any dimension. -- $Y$: a tensor whose dimensions must be less than or equal to the dimensions of $X$. - -There are two cases for this operator: - -1. The shape of $Y$ is the same with $X$. -2. The shape of $Y$ is a continuous subsequence of $X$. - -For case 2: - -1. Broadcast $Y$ to match the shape of $X$, where $axis$ is the start dimension index - for broadcasting $Y$ onto $X$. -2. If $axis$ is -1 (default), $axis = rank(X) - rank(Y)$. -3. The trailing dimensions of size 1 for $Y$ will be ignored for the consideration of - subsequence, such as shape(Y) = (2, 1) => (2). - -For example: - - .. code-block:: text - - shape(X) = (2, 3, 4, 5), shape(Y) = (,) - shape(X) = (2, 3, 4, 5), shape(Y) = (5,) - shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5), with axis=-1(default) or axis=2 - shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1 - shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0 - shape(X) = (2, 3, 4, 5), shape(Y) = (2, 1), with axis=0 - -)DOC", - GetName(), - GetOpFuntionality(), - GetEquation()); - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_elementwise_sub_op.cc b/paddle/fluid/operators/fused/fused_elementwise_sub_op.cc deleted file mode 100644 index 0b345134f663f7..00000000000000 --- a/paddle/fluid/operators/fused/fused_elementwise_sub_op.cc +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op.h" -#include "paddle/fluid/operators/fused/fused_elementwise_op.h" - -namespace paddle { -namespace operators { - -class FusedElementwiseSubOpMaker : public FusedElementwiseOpMaker { - protected: - std::string GetName() const override { return "Sub"; } - std::string GetEquation() const override { return "Out = X - Y"; } - - void AddInputX() override { - AddInput( - "X", - "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " - "should be int32, int64, float32, float64."); - } - - void AddInputY() override { - AddInput( - "Y", - "(Variable), Tensor or phi::DenseTensor of any dimensions. Its dtype " - "should be int32, int64, float32, float64."); - } - - std::string GetOpFuntionality() const override { - return "Subtract two tensors element-wise"; - } -}; - -} // namespace operators -} // namespace paddle -namespace ops = paddle::operators; - -REGISTER_OPERATOR( - fused_elementwise_sub, - ops::ElementwiseOp, - ops::FusedElementwiseSubOpMaker, - ops::ElementwiseOpInferVarType, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); From f93b1a7e469e4f24745312104fba92c7de504b48 Mon Sep 17 00:00:00 2001 From: Silv3S Date: Fri, 10 Mar 2023 14:10:09 +0100 Subject: [PATCH 09/23] adjust proto --- .../compat/fused_elementwise_add.pbtxt | 4 +++ .../compat/fused_elementwise_div.pbtxt | 28 +++++++++++++++++++ .../compat/fused_elementwise_mul.pbtxt | 2 +- .../compat/fused_elementwise_sub.pbtxt | 4 +++ .../fusion/onednn/fused_elementwise_kernel.cc | 7 ++--- 5 files changed, 40 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt index 374170aa4e97fe..6c589b7244ca47 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_add.pbtxt @@ -31,6 +31,10 @@ extra { name: "fused_output_scale" type: FLOAT } + attrs { + name: "fused_unsqueeze2_axes" + type: INTS + } attrs { name: "scale_x" type: FLOAT diff --git a/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt index 1eb6230dfc36d0..136edf14d3ef6a 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_div.pbtxt @@ -15,8 +15,36 @@ def { } } extra { + attrs { + name: "fuse_activation" + type: STRING + } + attrs { + name: "fuse_alpha" + type: FLOAT + } + attrs { + name: "fuse_beta" + type: FLOAT + } attrs { name: "fused_output_scale" type: FLOAT } + attrs { + name: "fused_unsqueeze2_axes" + type: INTS + } + attrs { + name: "scale_x" + type: FLOAT + } + attrs { + name: "scale_y" + type: FLOAT + } + attrs { + name: "scale_out" + type: FLOAT + } } diff --git a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt index 9c46e8b7c7e643..05e70950d3a212 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_mul.pbtxt @@ -33,7 +33,7 @@ extra { } attrs { name: "fused_unsqueeze2_axes" - type: INTS + type: INTS } attrs { name: "scale_x" diff --git a/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt index 
1f59922cff1556..b9d4acad8cbc75 100644 --- a/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt +++ b/paddle/fluid/operators/compat/fused_elementwise_sub.pbtxt @@ -31,6 +31,10 @@ extra { name: "fused_output_scale" type: FLOAT } + attrs { + name: "fused_unsqueeze2_axes" + type: INTS + } attrs { name: "scale_x" type: FLOAT diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc index 185b9797e7ec7f..e06c5860a6718c 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc @@ -65,15 +65,14 @@ void FusedElementwiseKernel(const OneDNNContext& dev_ctx, const auto src_x_memory = handler.AcquireSrcMemory(non_const_x); const auto src_y_memory = handler.AcquireSecondSrcMemory(non_const_y); - // (jczaja) For Inplace src and dst should be the same memory object. + // For Inplace src and dst should be the same memory object. // So x should share buffer with z. But UT mechanics is testing inplace // execution for this op not checking that x can be bradcasted to match in // shape y tensor. // This is wrong as when x is to be broadcasted then z(out) will match the // shape of y which is bigger than x. Hence if x is smaller in shape than z - // and they share a buffer (of - // shape x) then this buffer is not big enough to hold result of elementwise - // operation. + // and they share a buffer (of shape x) then this buffer is not big enough + // to hold result of elementwise operation. const bool reuse_x_memory = non_const_x->numel() == out->numel() && non_const_x->IsSharedBufferWith(*out); std::shared_ptr dst_memory; From 079fb837631476daa2dfabea7eb06c294c73b39d Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Mon, 13 Mar 2023 13:53:49 +0100 Subject: [PATCH 10/23] Add supported datatypes --- .../fusion/onednn/fused_elementwise_kernel.cc | 35 +++++++++++++------ 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc index e06c5860a6718c..81ccd12bda078c 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc @@ -162,31 +162,46 @@ PD_REGISTER_KERNEL(fused_elementwise_add, ONEDNN, phi::FusedAddKernel, float, - phi::dtype::bfloat16, - int8_t, - uint8_t) {} + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(fused_elementwise_sub, OneDNN, ONEDNN, phi::FusedSubtractKernel, float, - phi::dtype::bfloat16, - int8_t, - uint8_t) {} + double, + int16_t, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(fused_elementwise_mul, OneDNN, ONEDNN, phi::FusedMultiplyKernel, float, - phi::dtype::bfloat16, - int8_t, - uint8_t) {} + double, + int, + int64_t, + bool, + phi::dtype::complex, + phi::dtype::complex, + phi::dtype::bfloat16) {} PD_REGISTER_KERNEL(fused_elementwise_div, OneDNN, ONEDNN, phi::FusedDivideKernel, float, - phi::dtype::bfloat16) {} + double, + int, + int64_t, + phi::dtype::complex, + phi::dtype::complex) {} From 3651b8eadc659ea879fe9f733f7268eb7345642f Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Tue, 14 Mar 2023 17:37:32 +0100 Subject: [PATCH 11/23] Change 'Scale' to 'scale' in tests, change some tests to onednn --- .../framework/ir/graph_pattern_detector.cc | 8 ++- .../ir/mkldnn/cpu_quantize_pass_tester.cc | 34 
++++++------ .../test_trt_convert_multihead_matmul.py | 6 +-- ...p.py => test_elementwise_add_onednn_op.py} | 28 +++++----- ...p.py => test_elementwise_mul_onednn_op.py} | 22 ++++---- ...p.py => test_elementwise_sub_onednn_op.py} | 28 +++++----- python/paddle/incubate/autograd/primrules.py | 54 ++----------------- tools/parallel_UT_rule.py | 12 ++--- tools/static_mode_white_list.py | 6 +-- 9 files changed, 79 insertions(+), 119 deletions(-) rename python/paddle/fluid/tests/unittests/mkldnn/{test_elementwise_add_mkldnn_op.py => test_elementwise_add_onednn_op.py} (87%) rename python/paddle/fluid/tests/unittests/mkldnn/{test_elementwise_mul_mkldnn_op.py => test_elementwise_mul_onednn_op.py} (89%) rename python/paddle/fluid/tests/unittests/mkldnn/{test_elementwise_sub_mkldnn_op.py => test_elementwise_sub_onednn_op.py} (91%) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 858725f601079c..92c84ff50ef9cf 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2254,7 +2254,8 @@ PDNode *patterns::OpRequant::operator()() { auto any_op = pattern->NewNode(any_op_repr()) ->assert_is_op() ->assert_more([&](Node *node) { - return node->Op()->HasAttr("Scale_out") ? true : false; + return (node->Op()->HasAttr("Scale_out") || + node->Op()->HasAttr("scale_out")); }); auto requant_in = pattern->NewNode(requant_in_repr()) ->assert_is_op_input("requantize", "Input"); @@ -2282,7 +2283,10 @@ PDNode *patterns::RequantOp::operator()() { ->assert_more([&](Node *node) { return (node->Op()->HasAttr("Scale_in") || node->Op()->HasAttr("Scale_x") || - node->Op()->HasAttr("Scale_y")); + node->Op()->HasAttr("Scale_y") || + node->Op()->HasAttr("scale_in") || + node->Op()->HasAttr("scale_x") || + node->Op()->HasAttr("scale_y")); }); requant_op->LinksFrom({requant_in}).LinksTo({requant_out}); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 195aa2728acd57..9c32e564c3722b 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -96,14 +96,15 @@ void SetOp(ProgramDesc* prog, op->SetAttr("Scale_x", 1.0f); op->SetAttr("Scale_y", 1.0f); op->SetAttr("Scale_out", 1.0f); - } else if (type == "elementwise_add" || type == "elementwise_mul" || - type == "elementwise_sub") { + } else if (type == "fused_elementwise_add" || + type == "fused_elementwise_sub" || + type == "fused_elementwise_mul") { op->SetInput("X", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Y", {inputs[1]}); op->SetOutput("Out", {outputs[0]}); - op->SetAttr("Scale_x", 1.0f); - op->SetAttr("Scale_y", 1.0f); - op->SetAttr("Scale_out", 1.0f); + op->SetAttr("scale_x", 1.0f); + op->SetAttr("scale_y", 1.0f); + op->SetAttr("scale_out", 1.0f); } else if (type == "fusion_gru") { op->SetInput("X", {inputs[0]}); op->SetInput("Bias", {inputs[1]}); @@ -177,16 +178,19 @@ void CheckScales(const OpDesc* op, float scale, float shift) { scale); scale_names.push_back("Scale_in"); scale_names.push_back("Scale_out"); - } else if (type == "fused_matmul" || type == "elementwise_add" || - type == "elementwise_mul" || type == "elementwise_sub") { + } else if (type == "fused_matmul") { scale_names.push_back("Scale_x"); scale_names.push_back("Scale_y"); scale_names.push_back("Scale_out"); - if (type == "fused_matmul") { - auto const& names = op->InputNames(); - if 
(std::find(names.begin(), names.end(), "ResidualData") != names.end()) - scale_names.push_back("Scale_in_eltwise"); - } + auto const& names = op->InputNames(); + if (std::find(names.begin(), names.end(), "ResidualData") != names.end()) + scale_names.push_back("Scale_in_eltwise"); + } else if (type == "fused_elementwise_add" || + type == "fused_elementwise_sub" || + type == "fused_elementwise_mul") { + scale_names.push_back("scale_x"); + scale_names.push_back("scale_y"); + scale_names.push_back("scale_out"); } else if (type == "fusion_gru" || type == "fusion_lstm") { EXPECT_EQ(op->GetAttrIfExists("Shift_data"), shift); EXPECT_EQ(op->GetAttrIfExists>("Scale_weights")[0], @@ -709,9 +713,9 @@ void TestElementwiseUnsignedAndSignedInput( } const std::vector> elementwises = { - {"elementwise_add", "ElementwiseAdd"}, - {"elementwise_mul", "ElementwiseMul"}, - {"elementwise_sub", "ElementwiseSub"}}; + {"fused_elementwise_add", "FusedElementwiseAdd"}, + {"fused_elementwise_mul", "FusedElementwiseMul"}, + {"fused_elementwise_sub", "FusedElementwiseSub"}}; class TestElementwises : public testing::TestWithParam> {}; diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index cca7fcddcdabbf..f5658c172a890a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -834,9 +834,9 @@ def generate_weight2(): }, "op_outputs": {"Out": ["elementwise_add1_output"]}, "op_attrs": { - "Scale_out": 1.0, - "Scale_x": 1.0, - "Scale_y": 1.0, + "scale_out": 1.0, + "scale_x": 1.0, + "scale_y": 1.0, "axis": 2, "Out": 1.0, }, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py similarity index 87% rename from python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py index a50289de1eebfe..95b2f0f7b4ed75 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py @@ -23,7 +23,7 @@ ) -class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp): +class TestOneDNNElementwiseAddOp(TestElementwiseAddOp): def init_kernel_type(self): self.use_mkldnn = True @@ -31,21 +31,21 @@ def init_dtype(self): self.dtype = np.float32 -class TestMKLDNNElementwiseAddOp2(TestMKLDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOp2(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.random((100,)).astype(self.dtype) self.y = np.random.random((100,)).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestMKLDNNElementwiseAddOp3(TestMKLDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOp3(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestMKLDNNElementwiseAddOp4(TestMKLDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOp4(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) @@ -59,21 +59,21 @@ def 
test_check_grad_ingore_y(self): pass -class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOp5(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOpBroadcastXintoY(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.uniform(1, 2, [2, 50, 1]).astype(self.dtype) self.y = np.random.uniform(1, 2, [2, 50, 160]).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp): +class TestOneDNNElementwiseAddOp_broadcast_3(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) self.y = np.random.rand(10, 12).astype(self.dtype) @@ -83,7 +83,7 @@ def init_axis(self): self.axis = 1 -class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestMKLDNNElementwiseAddOp): +class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.rand(10, 12).astype(self.dtype) self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) @@ -123,9 +123,9 @@ def init_input_output(self): self.out = np.add(self.x, self.y) def init_scales(self): - self.attrs['Scale_x'] = 1.0 - self.attrs['Scale_y'] = 1.0 - self.attrs['Scale_out'] = 1.0 + self.attrs['scale_x'] = 1.0 + self.attrs['scale_y'] = 1.0 + self.attrs['scale_out'] = 1.0 def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode @@ -159,9 +159,9 @@ def init_input_output(self): self.scale_o, self.out = self.quantize(self.out_f) def init_scales(self): - self.attrs['Scale_x'] = self.scale_x - self.attrs['Scale_y'] = self.scale_y - self.attrs['Scale_out'] = self.scale_o + self.attrs['scale_x'] = self.scale_x + self.attrs['scale_y'] = self.scale_y + self.attrs['scale_out'] = self.scale_o def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py similarity index 89% rename from python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py index a558b05196dcf1..9d3afeaa40925f 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py @@ -23,7 +23,7 @@ ) -class TestMKLDNNElementwiseMulOp(ElementwiseMulOp): +class TestOneDNNElementwiseMulOp(ElementwiseMulOp): def init_kernel_type(self): self.use_mkldnn = True @@ -31,21 +31,21 @@ def init_dtype(self): self.dtype = np.float32 -class TestMKLDNNElementwiseMulOp2(TestMKLDNNElementwiseMulOp): +class TestOneDNNElementwiseMulOp2(TestOneDNNElementwiseMulOp): def init_input_output(self): self.x = np.random.random((100,)).astype(self.dtype) self.y = np.random.random((100,)).astype(self.dtype) self.out = np.multiply(self.x, self.y) -class TestMKLDNNElementwiseMulOp3(TestMKLDNNElementwiseMulOp): +class TestOneDNNElementwiseMulOp3(TestOneDNNElementwiseMulOp): def init_input_output(self): self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) self.y = np.random.uniform(0.1, 1, [2, 3, 4, 
5]).astype(self.dtype) self.out = np.multiply(self.x, self.y) -class TestMKLDNNElementwiseMulOp4(TestMKLDNNElementwiseMulOp): +class TestOneDNNElementwiseMulOp4(TestOneDNNElementwiseMulOp): def init_input_output(self): self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) @@ -59,7 +59,7 @@ def test_check_grad_ingore_y(self): pass -class TestMKLDNNElementwiseMulOp5(TestMKLDNNElementwiseMulOp): +class TestOneDNNElementwiseMulOp5(TestOneDNNElementwiseMulOp): def init_input_output(self): self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) @@ -96,9 +96,9 @@ def init_input_output(self): self.out = np.multiply(self.x, self.y) def init_scales(self): - self.attrs['Scale_x'] = 1.0 - self.attrs['Scale_y'] = 1.0 - self.attrs['Scale_out'] = 1.0 + self.attrs['scale_x'] = 1.0 + self.attrs['scale_y'] = 1.0 + self.attrs['scale_out'] = 1.0 def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode @@ -132,9 +132,9 @@ def init_input_output(self): self.scale_o, self.out = self.quantize(self.out_f) def init_scales(self): - self.attrs['Scale_x'] = self.scale_x - self.attrs['Scale_y'] = self.scale_y - self.attrs['Scale_out'] = self.scale_o + self.attrs['scale_x'] = self.scale_x + self.attrs['scale_y'] = self.scale_y + self.attrs['scale_out'] = self.scale_o def test_check_output(self): # TODO(wangzhongpu): support mkldnn op in dygraph mode diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py similarity index 91% rename from python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py rename to python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py index 685e8cc648a5e3..5f97197a683ece 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py @@ -30,7 +30,7 @@ not (isinstance(_current_expected_place(), core.CPUPlace)), "GPU is not supported", ) -class TestMKLDNNElementwiseSubOp(OpTest): +class TestOneDNNElementwiseSubOp(OpTest): def setUp(self): self.op_type = "elementwise_sub" self.init_dtype() @@ -71,49 +71,49 @@ def test_check_output(self): self.check_output() -class TestMKLDNNElementwiseSubOp2(TestMKLDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOp2(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.random((100,)).astype(self.dtype) self.y = np.random.random((100,)).astype(self.dtype) self.out = np.subtract(self.x, self.y) -class TestMKLDNNElementwiseSubOp3(TestMKLDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOp3(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) self.out = np.subtract(self.x, self.y) -class TestMKLDNNElementwiseSubOp4(TestMKLDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOp4(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) self.out = np.subtract(self.x, self.y) -class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOp5(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = 
np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) self.out = np.subtract(self.x, self.y) -class TestMKLDNNElementwiseSubOp6(TestMKLDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOp6(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.uniform(0.1, 2, [180, 1]).astype(self.dtype) self.y = np.random.uniform(0.1, 1, [1, 256]).astype(self.dtype) self.out = np.subtract(self.x, self.y) -class TestMKLDNNElementwiseSubOp7(TestMKLDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOp7(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.uniform(0.1, 2, [1, 180]).astype(self.dtype) self.y = np.random.uniform(0.1, 1, [256, 1]).astype(self.dtype) self.out = np.subtract(self.x, self.y) -class TestMKLDNNElementwiseSubOp_broadcast(TestMKLDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOp_broadcast(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) self.y = np.random.rand(10, 12).astype(self.dtype) @@ -123,7 +123,7 @@ def init_axis(self): self.axis = 1 -class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp): +class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.rand(10, 12).astype(self.dtype) self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) @@ -134,7 +134,7 @@ def init_axis(self): @OpTestTool.skip_if_not_cpu_bf16() -class TestBf16(TestMKLDNNElementwiseSubOp): +class TestBf16(TestOneDNNElementwiseSubOp): def setUp(self): self.op_type = "elementwise_sub" self.init_dtype() @@ -223,7 +223,7 @@ def test_check_grad_ignore_x(self): ) -class TestInt8(TestMKLDNNElementwiseSubOp): +class TestInt8(TestOneDNNElementwiseSubOp): def init_kernel_type(self): self.use_mkldnn = True self._cpu_only = True @@ -237,9 +237,9 @@ def init_input_output(self): self.out = np.subtract(self.x, self.y) def init_scales(self): - self.attrs['Scale_x'] = 1.0 - self.attrs['Scale_y'] = 1.0 - self.attrs['Scale_out'] = 1.0 + self.attrs['scale_x'] = 1.0 + self.attrs['scale_y'] = 1.0 + self.attrs['scale_out'] = 1.0 def test_check_output(self): self.init_scales() diff --git a/python/paddle/incubate/autograd/primrules.py b/python/paddle/incubate/autograd/primrules.py index 137747e75da15e..06dad16b9bf5bc 100644 --- a/python/paddle/incubate/autograd/primrules.py +++ b/python/paddle/incubate/autograd/primrules.py @@ -132,69 +132,21 @@ def linear_jvp(op, *args, **kwargs): def elementwise_add_orig2prim(op, x, y): if x.shape != y.shape: y = broadcast(y, shape=x.shape) - if op.attr('Scale_x') - 1.0 > 1e-5: - scale_x = fill_const( - shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x') - ) - x = mul(x, scale_x) - if op.attr('Scale_y') - 1.0 > 1e-5: - scale_y = fill_const( - shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y') - ) - y = mul(y, scale_y) - z = add(x, y) - if op.attr('Scale_out') - 1.0 > 1e-5: - scale_out = fill_const( - shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out') - ) - z = mul(z, scale_out) - return z + return add(x, y) @REGISTER_ORIG2PRIM('elementwise_sub') def elementwise_sub_orig2prim(op, x, y): if x.shape != y.shape: y = broadcast(y, shape=x.shape) - if op.attr('Scale_x') - 1.0 > 1e-5: - scale_x = fill_const( - shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x') - ) - x = mul(x, scale_x) - if op.attr('Scale_y') - 1.0 > 1e-5: - scale_y = fill_const( - shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y') - ) - y = mul(y, 
scale_y) - z = sub(x, y) - if op.attr('Scale_out') - 1.0 > 1e-5: - scale_out = fill_const( - shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out') - ) - z = mul(z, scale_out) - return z + return sub(x, y) @REGISTER_ORIG2PRIM('elementwise_mul') def elementwise_mul_orig2prim(op, x, y): if x.shape != y.shape: y = broadcast(y, shape=x.shape) - if op.attr('Scale_x') - 1.0 > 1e-5: - scale_x = fill_const( - shape=x.shape, dtype=x.dtype, value=op.attr('Scale_x') - ) - x = mul(x, scale_x) - if op.attr('Scale_y') - 1.0 > 1e-5: - scale_y = fill_const( - shape=y.shape, dtype=y.dtype, value=op.attr('Scale_y') - ) - y = mul(y, scale_y) - z = mul(x, y) - if op.attr('Scale_out') - 1.0 > 1e-5: - scale_out = fill_const( - shape=z.shape, dtype=z.dtype, value=op.attr('Scale_out') - ) - z = mul(z, scale_out) - return z + return mul(x, y) @REGISTER_ORIG2PRIM('elementwise_div') diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 90f6f3600e3fdd..05a6ebe4e3e7d7 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -370,7 +370,7 @@ 'test_adaptive_pool2d_convert_global_pass', 'test_lookup_table_v2_bf16_op', 'test_operator_desc', - 'test_elementwise_mul_mkldnn_op', + 'test_elementwise_mul_onednn_op', 'test_fetch_handler', 'test_cpu_bfloat16_placement_pass', 'test_match_matrix_tensor_op', @@ -649,7 +649,7 @@ 'test_auto_parallel_reshard_serial', 'test_auto_parallel_reshard_dpmppp', 'test_clip_mkldnn_op', - 'test_elementwise_sub_mkldnn_op', + 'test_elementwise_sub_onednn_op', 'test_flatten_mkldnn_op', 'test_slice_mkldnn_op', 'test_ir_generate_pass', @@ -717,7 +717,7 @@ 'test_quant2_int8_resnet50_channelwise_mkldnn', 'test_analyzer_bert', 'test_directory_migration', - 'test_elementwise_add_mkldnn_op', + 'test_elementwise_add_onednn_op', 'test_quant_int8_googlenet_mkldnn', 'test_callback_early_stop', ] @@ -2259,7 +2259,7 @@ 'test_scale_bf16_mkldnn_op', 'test_ir_generate_pass', 'test_expand_v2_mkldnn_op', - 'test_elementwise_sub_mkldnn_op', + 'test_elementwise_sub_onednn_op', ] # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, @@ -2702,7 +2702,7 @@ 'test_fusion_transpose_flatten_concat_op', 'test_elementwise_nn_grad', 'test_hinge_loss_op', - 'test_elementwise_add_mkldnn_op', + 'test_elementwise_add_onednn_op', 'test_optimizer', 'test_deformable_conv_op', 'test_py_reader_push_pop', @@ -2861,7 +2861,7 @@ 'test_queue', 'test_cross_entropy_op', 'test_detection', - 'test_elementwise_mul_mkldnn_op', + 'test_elementwise_mul_onednn_op', 'test_grid_generator', 'test_functional_conv2d', 'test_fit_a_line', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 3d7e2654a4f639..f83d21c1fca076 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -594,11 +594,11 @@ 'test_conv2d_transpose_bf16_mkldnn_op', 'test_conv3d_mkldnn_op', 'test_dequantize_mkldnn_op', - 'test_elementwise_add_mkldnn_op', + 'test_elementwise_add_onednn_op', 'test_elementwise_add_bf16_mkldnn_op', 'test_elementwise_div_mkldnn_op', - 'test_elementwise_sub_mkldnn_op', - 'test_elementwise_mul_mkldnn_op', + 'test_elementwise_sub_onednn_op', + 'test_elementwise_mul_onednn_op', 'test_elementwise_mul_bf16_mkldnn_op', 'test_fc_mkldnn_op', 'test_fc_bf16_mkldnn_op', From f4909e8cbf50b3a8561b0ed6f148a88331666d35 Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Thu, 16 Mar 2023 19:03:37 +0100 Subject: [PATCH 12/23] Revert breaking changes --- .../operators/compat/elementwise_add.pbtxt | 12 ++++++ 
.../operators/compat/elementwise_div.pbtxt | 12 ++++++ .../operators/compat/elementwise_mul.pbtxt | 12 ++++++ .../operators/compat/elementwise_pow.pbtxt | 12 ++++++ .../operators/compat/elementwise_sub.pbtxt | 12 ++++++ paddle/phi/api/yaml/op_compat.yaml | 39 ++++++++++++------- .../fusion/onednn/fused_elementwise_kernel.cc | 37 ++++++------------ .../phi/kernels/onednn/elementwise_kernel.cc | 29 ++++++++++++-- ...st_onednn_operator_unsqueeze2_fuse_pass.py | 2 +- tools/parallel_UT_rule.py | 12 +++--- tools/static_mode_white_list.py | 6 +-- 11 files changed, 133 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/operators/compat/elementwise_add.pbtxt b/paddle/fluid/operators/compat/elementwise_add.pbtxt index fdafc47dcd12ac..ee2f1d7c6861e3 100644 --- a/paddle/fluid/operators/compat/elementwise_add.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_add.pbtxt @@ -23,4 +23,16 @@ extra { name: "Out0_threshold" type: FLOAT } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } } diff --git a/paddle/fluid/operators/compat/elementwise_div.pbtxt b/paddle/fluid/operators/compat/elementwise_div.pbtxt index 4baa9dea4fcdd5..0b46bfb7484013 100644 --- a/paddle/fluid/operators/compat/elementwise_div.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_div.pbtxt @@ -19,4 +19,16 @@ extra { name: "act" type: STRING } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } } diff --git a/paddle/fluid/operators/compat/elementwise_mul.pbtxt b/paddle/fluid/operators/compat/elementwise_mul.pbtxt index 0db1c9c782ae2d..94cec09ace8f32 100644 --- a/paddle/fluid/operators/compat/elementwise_mul.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_mul.pbtxt @@ -13,4 +13,16 @@ def { name: "axis" type: INT } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } } diff --git a/paddle/fluid/operators/compat/elementwise_pow.pbtxt b/paddle/fluid/operators/compat/elementwise_pow.pbtxt index 018ca70bcbca19..e9427104dbfa7c 100644 --- a/paddle/fluid/operators/compat/elementwise_pow.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_pow.pbtxt @@ -19,4 +19,16 @@ extra { name: "act" type: STRING } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } } diff --git a/paddle/fluid/operators/compat/elementwise_sub.pbtxt b/paddle/fluid/operators/compat/elementwise_sub.pbtxt index ae8a1352452d27..f73b92fb95f87c 100644 --- a/paddle/fluid/operators/compat/elementwise_sub.pbtxt +++ b/paddle/fluid/operators/compat/elementwise_sub.pbtxt @@ -19,4 +19,16 @@ extra { name: "act" type: STRING } + attrs { + name: "Scale_x" + type: FLOAT + } + attrs { + name: "Scale_y" + type: FLOAT + } + attrs { + name: "Scale_out" + type: FLOAT + } } diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml index e1430844dea87b..98fbee81627c38 100644 --- a/paddle/phi/api/yaml/op_compat.yaml +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -41,7 +41,8 @@ - op : add (elementwise_add) backward : add_grad (elementwise_add_grad) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : addmm backward : 
addmm_grad @@ -459,7 +460,8 @@ outputs : out: Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : dot inputs : @@ -511,7 +513,8 @@ - op : elementwise_pow backward : elementwise_pow_grad extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : elu backward : elu_grad, elu_double_grad (elu_grad_grad) @@ -672,17 +675,20 @@ - op : floor_divide (elementwise_floordiv) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : fmax (elementwise_fmax) backward : fmax_grad (elementwise_fmax_grad) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : fmin (elementwise_fmin) backward : fmin_grad (elementwise_fmin_grad) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : fold inputs : @@ -748,7 +754,8 @@ - op : grad_add extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : greater_equal inputs : @@ -825,7 +832,8 @@ - op : heaviside (elementwise_heaviside) backward : heaviside_grad (elementwise_heaviside_grad) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : histogram inputs : @@ -1113,12 +1121,14 @@ - op : maximum (elementwise_max) backward : maximum_grad (elementwise_max_grad) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : maximum (elementwise_min) backward : maximum_grad (elementwise_min_grad) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : maxout inputs : @@ -1190,7 +1200,8 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : mv inputs : @@ -1391,7 +1402,8 @@ - op : remainder (elementwise_mod) extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, 
str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : renorm backward : renorm_grad @@ -1687,7 +1699,8 @@ outputs : out : Out extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] - op : sum (reduce_sum) backward : (sum_grad) reduce_sum_grad diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc index 81ccd12bda078c..92e1132f9930a7 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc @@ -162,46 +162,33 @@ PD_REGISTER_KERNEL(fused_elementwise_add, ONEDNN, phi::FusedAddKernel, float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::bfloat16, + int8_t, + uint8_t) {} PD_REGISTER_KERNEL(fused_elementwise_sub, OneDNN, ONEDNN, phi::FusedSubtractKernel, float, - double, - int16_t, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + int8_t, + uint8_t) {} PD_REGISTER_KERNEL(fused_elementwise_mul, OneDNN, ONEDNN, phi::FusedMultiplyKernel, float, - double, - int, - int64_t, - bool, - phi::dtype::complex, - phi::dtype::complex, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16, + int8_t, + uint8_t) {} PD_REGISTER_KERNEL(fused_elementwise_div, OneDNN, ONEDNN, phi::FusedDivideKernel, float, - double, - int, - int64_t, - phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::bfloat16, + int8_t, + uint8_t) {} diff --git a/paddle/phi/kernels/onednn/elementwise_kernel.cc b/paddle/phi/kernels/onednn/elementwise_kernel.cc index 1105a4d80b9ee6..6aa478756006f1 100644 --- a/paddle/phi/kernels/onednn/elementwise_kernel.cc +++ b/paddle/phi/kernels/onednn/elementwise_kernel.cc @@ -30,6 +30,26 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); + float scale_x = dev_ctx.HasDnnAttr("Scale_x") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")) + : 1.0f; + float scale_y = dev_ctx.HasDnnAttr("Scale_y") + ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")) + : 1.0f; + float scale_out = + dev_ctx.HasDnnAttr("Scale_out") + ? 
PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")) + : 1.0f; + + dnnl::post_ops post_operations; + funcs::AppendActivation(dev_ctx, post_operations); + if (dev_ctx.HasDnnAttr("fused_output_scale")) { + float scale_alpha = + PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fused_output_scale")); + post_operations.append_eltwise( + 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); + } + auto* non_const_x = &x; auto* non_const_y = &y; @@ -40,10 +60,11 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, non_const_x, non_const_y, out, - 1.0f, - 1.0f, - 1.0f, - true); + scale_x, + scale_y, + scale_out, + true, + post_operations); // oneDNN's binary is optimized for broadcasting y into x, so in other case // we have to swap tensors to achieve optimal performance diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py index 0c92a845f06c2c..5b23c545fe402b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_onednn_operator_unsqueeze2_fuse_pass.py @@ -143,7 +143,7 @@ def sample_predictor_configs(self, program_config): "operator_unsqueeze2_onednn_fuse_pass", ], ) - yield config, ["elementwise_mul"], (1e-5, 1e-5) + yield config, ["fused_elementwise_mul"], (1e-5, 1e-5) def test(self): self.run_and_statis( diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 05a6ebe4e3e7d7..90f6f3600e3fdd 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -370,7 +370,7 @@ 'test_adaptive_pool2d_convert_global_pass', 'test_lookup_table_v2_bf16_op', 'test_operator_desc', - 'test_elementwise_mul_onednn_op', + 'test_elementwise_mul_mkldnn_op', 'test_fetch_handler', 'test_cpu_bfloat16_placement_pass', 'test_match_matrix_tensor_op', @@ -649,7 +649,7 @@ 'test_auto_parallel_reshard_serial', 'test_auto_parallel_reshard_dpmppp', 'test_clip_mkldnn_op', - 'test_elementwise_sub_onednn_op', + 'test_elementwise_sub_mkldnn_op', 'test_flatten_mkldnn_op', 'test_slice_mkldnn_op', 'test_ir_generate_pass', @@ -717,7 +717,7 @@ 'test_quant2_int8_resnet50_channelwise_mkldnn', 'test_analyzer_bert', 'test_directory_migration', - 'test_elementwise_add_onednn_op', + 'test_elementwise_add_mkldnn_op', 'test_quant_int8_googlenet_mkldnn', 'test_callback_early_stop', ] @@ -2259,7 +2259,7 @@ 'test_scale_bf16_mkldnn_op', 'test_ir_generate_pass', 'test_expand_v2_mkldnn_op', - 'test_elementwise_sub_onednn_op', + 'test_elementwise_sub_mkldnn_op', ] # It run 4 job each time, If it failed due to Insufficient GPU memory or CUBLAS_STATUS_ALLOC_FAILED, @@ -2702,7 +2702,7 @@ 'test_fusion_transpose_flatten_concat_op', 'test_elementwise_nn_grad', 'test_hinge_loss_op', - 'test_elementwise_add_onednn_op', + 'test_elementwise_add_mkldnn_op', 'test_optimizer', 'test_deformable_conv_op', 'test_py_reader_push_pop', @@ -2861,7 +2861,7 @@ 'test_queue', 'test_cross_entropy_op', 'test_detection', - 'test_elementwise_mul_onednn_op', + 'test_elementwise_mul_mkldnn_op', 'test_grid_generator', 'test_functional_conv2d', 'test_fit_a_line', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index f83d21c1fca076..3d7e2654a4f639 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -594,11 +594,11 @@ 'test_conv2d_transpose_bf16_mkldnn_op', 'test_conv3d_mkldnn_op', 'test_dequantize_mkldnn_op', - 
'test_elementwise_add_onednn_op', + 'test_elementwise_add_mkldnn_op', 'test_elementwise_add_bf16_mkldnn_op', 'test_elementwise_div_mkldnn_op', - 'test_elementwise_sub_onednn_op', - 'test_elementwise_mul_onednn_op', + 'test_elementwise_sub_mkldnn_op', + 'test_elementwise_mul_mkldnn_op', 'test_elementwise_mul_bf16_mkldnn_op', 'test_fc_mkldnn_op', 'test_fc_bf16_mkldnn_op', From b587ed0c47b38eb227517d6852e9a327fe8bdf4a Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Fri, 17 Mar 2023 13:32:22 +0100 Subject: [PATCH 13/23] Fix unit tests --- .../phi/kernels/onednn/elementwise_kernel.cc | 42 +++---------------- .../mkldnn/test_elementwise_add_onednn_op.py | 21 +++++++++- .../mkldnn/test_elementwise_mul_onednn_op.py | 22 +++++++++- .../mkldnn/test_elementwise_sub_onednn_op.py | 6 +-- 4 files changed, 49 insertions(+), 42 deletions(-) diff --git a/paddle/phi/kernels/onednn/elementwise_kernel.cc b/paddle/phi/kernels/onednn/elementwise_kernel.cc index 6aa478756006f1..02b148e54f547b 100644 --- a/paddle/phi/kernels/onednn/elementwise_kernel.cc +++ b/paddle/phi/kernels/onednn/elementwise_kernel.cc @@ -30,26 +30,6 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, DenseTensor* out) { const auto& onednn_engine = dev_ctx.GetEngine(); - float scale_x = dev_ctx.HasDnnAttr("Scale_x") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_x")) - : 1.0f; - float scale_y = dev_ctx.HasDnnAttr("Scale_y") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_y")) - : 1.0f; - float scale_out = - dev_ctx.HasDnnAttr("Scale_out") - ? PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("Scale_out")) - : 1.0f; - - dnnl::post_ops post_operations; - funcs::AppendActivation(dev_ctx, post_operations); - if (dev_ctx.HasDnnAttr("fused_output_scale")) { - float scale_alpha = - PADDLE_GET_CONST(float, dev_ctx.GetDnnAttr("fused_output_scale")); - post_operations.append_eltwise( - 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); - } - auto* non_const_x = &x; auto* non_const_y = &y; @@ -60,11 +40,10 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, non_const_x, non_const_y, out, - scale_x, - scale_y, - scale_out, - true, - post_operations); + 1.0f, + 1.0f, + 1.0f, + true); // oneDNN's binary is optimized for broadcasting y into x, so in other case // we have to swap tensors to achieve optimal performance @@ -122,18 +101,7 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx, out_md = out_md.reshape(dims); } - const auto fused_unsqueeze2_axes = - dev_ctx.HasDnnAttr("fused_unsqueeze2_axes") - ? 
PADDLE_GET_CONST(std::vector, - dev_ctx.GetDnnAttr("fused_unsqueeze2_axes")) - : std::vector(); - - if (!fused_unsqueeze2_axes.empty()) { - funcs::SetOutMemDescWithUnsqueeze2FuseSupport( - fused_unsqueeze2_axes, out, out_md); - } else { - out->set_mem_desc(out_md); - } + out->set_mem_desc(out_md); } #define DEFINE_ONEDNN_ELEMENTWISE_KERNEL(name, algorithm) \ diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py index 95b2f0f7b4ed75..c439feda1f77b0 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py @@ -16,8 +16,9 @@ import numpy as np +import paddle from paddle import enable_static -from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci from paddle.fluid.tests.unittests.test_elementwise_add_op import ( TestElementwiseAddOp, ) @@ -110,6 +111,24 @@ def test_check_grad_ingore_x(self): reason="oneDNN's int8 elementwise_ops don't implemend grad kernel." ) class TestInt8(TestElementwiseAddOp): + def setUp(self): + self.op_type = "fused_elementwise_add" + self.python_api = paddle.add + self.prim_op_type = "prim" + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + self.if_check_prim() + self.if_enable_cinn() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y), + } + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + self.outputs = {'Out': self.out} + def init_kernel_type(self): self.use_mkldnn = True self._cpu_only = True diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py index 9d3afeaa40925f..fb85a3d2eb9595 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py @@ -16,8 +16,9 @@ import numpy as np +import paddle from paddle import enable_static -from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci from paddle.fluid.tests.unittests.test_elementwise_mul_op import ( ElementwiseMulOp, ) @@ -83,6 +84,25 @@ def test_check_grad_ingore_x(self): reason="oneDNN's int8 elementwise_ops don't implemend grad kernel." 
) class TestInt8(ElementwiseMulOp): + def setUp(self): + self.op_type = "fused_elementwise_mul" + self.prim_op_type = "prim" + self.python_api = paddle.multiply + self.dtype = np.float64 + self.axis = -1 + self.init_dtype() + self.init_input_output() + self.init_kernel_type() + self.init_axis() + self.if_enable_cinn() + + self.inputs = { + 'X': OpTest.np_dtype_to_fluid_dtype(self.x), + 'Y': OpTest.np_dtype_to_fluid_dtype(self.y), + } + self.outputs = {'Out': self.out} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + def init_kernel_type(self): self.use_mkldnn = True self._cpu_only = True diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py index 5f97197a683ece..4b88cc2b2137dc 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py @@ -237,9 +237,9 @@ def init_input_output(self): self.out = np.subtract(self.x, self.y) def init_scales(self): - self.attrs['scale_x'] = 1.0 - self.attrs['scale_y'] = 1.0 - self.attrs['scale_out'] = 1.0 + self.attrs['Scale_x'] = 1.0 + self.attrs['Scale_y'] = 1.0 + self.attrs['Scale_out'] = 1.0 def test_check_output(self): self.init_scales() From 43224096e3d8a2ffe250c1700708efab47f78559 Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Mon, 20 Mar 2023 11:26:36 +0100 Subject: [PATCH 14/23] Delete obsolete test cases --- .../mkldnn/test_elementwise_add_onednn_op.py | 63 +------------- .../mkldnn/test_elementwise_mul_onednn_op.py | 86 +++++++------------ 2 files changed, 34 insertions(+), 115 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py index c439feda1f77b0..9b2d4aec1a369c 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py @@ -16,9 +16,8 @@ import numpy as np -import paddle from paddle import enable_static -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci from paddle.fluid.tests.unittests.test_elementwise_add_op import ( TestElementwiseAddOp, ) @@ -111,24 +110,6 @@ def test_check_grad_ingore_x(self): reason="oneDNN's int8 elementwise_ops don't implemend grad kernel." 
) class TestInt8(TestElementwiseAddOp): - def setUp(self): - self.op_type = "fused_elementwise_add" - self.python_api = paddle.add - self.prim_op_type = "prim" - self.init_dtype() - self.init_input_output() - self.init_kernel_type() - self.init_axis() - self.if_check_prim() - self.if_enable_cinn() - - self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(self.x), - 'Y': OpTest.np_dtype_to_fluid_dtype(self.y), - } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} - self.outputs = {'Out': self.out} - def init_kernel_type(self): self.use_mkldnn = True self._cpu_only = True @@ -161,48 +142,6 @@ def test_check_grad_ingore_y(self): pass -class TestInt8Scales(TestInt8): - def quantize(self, tensor, dt="int8"): - max_int = 127.0 if dt == "int8" else 255.0 - scale = max_int / np.abs(np.amax(tensor)) - quantized = np.round(scale * tensor).astype(dt) - return scale, quantized - - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.add(self.x_f, self.y_f) - - self.scale_x, self.x = self.quantize(self.x_f) - self.scale_y, self.y = self.quantize(self.y_f) - self.scale_o, self.out = self.quantize(self.out_f) - - def init_scales(self): - self.attrs['scale_x'] = self.scale_x - self.attrs['scale_y'] = self.scale_y - self.attrs['scale_out'] = self.scale_o - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.init_scales() - int_atol = 1 # different quantization techniques - self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) - - -class TestUint8Scales(TestInt8Scales): - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.add(self.x_f, self.y_f) - - self.scale_x, self.x = self.quantize(self.x_f, "uint8") - self.scale_y, self.y = self.quantize(self.y_f, "uint8") - self.scale_o, self.out = self.quantize(self.out_f, "uint8") - - def init_dtype(self): - self.dtype = np.uint8 - - if __name__ == '__main__': enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py index fb85a3d2eb9595..fea3c8ac464848 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py @@ -16,9 +16,8 @@ import numpy as np -import paddle from paddle import enable_static -from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci from paddle.fluid.tests.unittests.test_elementwise_mul_op import ( ElementwiseMulOp, ) @@ -84,25 +83,6 @@ def test_check_grad_ingore_x(self): reason="oneDNN's int8 elementwise_ops don't implemend grad kernel." 
) class TestInt8(ElementwiseMulOp): - def setUp(self): - self.op_type = "fused_elementwise_mul" - self.prim_op_type = "prim" - self.python_api = paddle.multiply - self.dtype = np.float64 - self.axis = -1 - self.init_dtype() - self.init_input_output() - self.init_kernel_type() - self.init_axis() - self.if_enable_cinn() - - self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(self.x), - 'Y': OpTest.np_dtype_to_fluid_dtype(self.y), - } - self.outputs = {'Out': self.out} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} - def init_kernel_type(self): self.use_mkldnn = True self._cpu_only = True @@ -135,46 +115,46 @@ def test_check_grad_ingore_y(self): pass -class TestInt8Scales(TestInt8): - def quantize(self, tensor, dt="int8"): - max_int = 127.0 if dt == "int8" else 255.0 - scale = max_int / np.abs(np.amax(tensor)) - quantized = np.round(scale * tensor).astype(dt) - return scale, quantized +# class TestInt8Scales(TestInt8): +# def quantize(self, tensor, dt="int8"): +# max_int = 127.0 if dt == "int8" else 255.0 +# scale = max_int / np.abs(np.amax(tensor)) +# quantized = np.round(scale * tensor).astype(dt) +# return scale, quantized - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.multiply(self.x_f, self.y_f) +# def init_input_output(self): +# self.x_f = np.random.random((100,)).astype("float") +# self.y_f = np.random.random((100,)).astype("float") +# self.out_f = np.multiply(self.x_f, self.y_f) - self.scale_x, self.x = self.quantize(self.x_f) - self.scale_y, self.y = self.quantize(self.y_f) - self.scale_o, self.out = self.quantize(self.out_f) +# self.scale_x, self.x = self.quantize(self.x_f) +# self.scale_y, self.y = self.quantize(self.y_f) +# self.scale_o, self.out = self.quantize(self.out_f) - def init_scales(self): - self.attrs['scale_x'] = self.scale_x - self.attrs['scale_y'] = self.scale_y - self.attrs['scale_out'] = self.scale_o +# def init_scales(self): +# self.attrs['scale_x'] = self.scale_x +# self.attrs['scale_y'] = self.scale_y +# self.attrs['scale_out'] = self.scale_o - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.init_scales() - int_atol = 1 # different quantization techniques - self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) +# def test_check_output(self): +# # TODO(wangzhongpu): support mkldnn op in dygraph mode +# self.init_scales() +# int_atol = 1 # different quantization techniques +# self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) -class TestUint8Scales(TestInt8Scales): - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.multiply(self.x_f, self.y_f) +# class TestUint8Scales(TestInt8Scales): +# def init_input_output(self): +# self.x_f = np.random.random((100,)).astype("float") +# self.y_f = np.random.random((100,)).astype("float") +# self.out_f = np.multiply(self.x_f, self.y_f) - self.scale_x, self.x = self.quantize(self.x_f, "uint8") - self.scale_y, self.y = self.quantize(self.y_f, "uint8") - self.scale_o, self.out = self.quantize(self.out_f, "uint8") +# self.scale_x, self.x = self.quantize(self.x_f, "uint8") +# self.scale_y, self.y = self.quantize(self.y_f, "uint8") +# self.scale_o, self.out = self.quantize(self.out_f, "uint8") - def init_dtype(self): - self.dtype = np.uint8 +# def init_dtype(self): +# self.dtype = np.uint8 if __name__ == '__main__': From 
6d21d733e5d08b18949f1e5865e09ca4813c7e96 Mon Sep 17 00:00:00 2001 From: Hulek Date: Tue, 21 Mar 2023 09:36:50 +0100 Subject: [PATCH 15/23] Delete commented out code --- .../mkldnn/test_elementwise_mul_onednn_op.py | 42 ------------------- 1 file changed, 42 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py index fea3c8ac464848..b05f407dc33b46 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py @@ -115,48 +115,6 @@ def test_check_grad_ingore_y(self): pass -# class TestInt8Scales(TestInt8): -# def quantize(self, tensor, dt="int8"): -# max_int = 127.0 if dt == "int8" else 255.0 -# scale = max_int / np.abs(np.amax(tensor)) -# quantized = np.round(scale * tensor).astype(dt) -# return scale, quantized - -# def init_input_output(self): -# self.x_f = np.random.random((100,)).astype("float") -# self.y_f = np.random.random((100,)).astype("float") -# self.out_f = np.multiply(self.x_f, self.y_f) - -# self.scale_x, self.x = self.quantize(self.x_f) -# self.scale_y, self.y = self.quantize(self.y_f) -# self.scale_o, self.out = self.quantize(self.out_f) - -# def init_scales(self): -# self.attrs['scale_x'] = self.scale_x -# self.attrs['scale_y'] = self.scale_y -# self.attrs['scale_out'] = self.scale_o - -# def test_check_output(self): -# # TODO(wangzhongpu): support mkldnn op in dygraph mode -# self.init_scales() -# int_atol = 1 # different quantization techniques -# self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) - - -# class TestUint8Scales(TestInt8Scales): -# def init_input_output(self): -# self.x_f = np.random.random((100,)).astype("float") -# self.y_f = np.random.random((100,)).astype("float") -# self.out_f = np.multiply(self.x_f, self.y_f) - -# self.scale_x, self.x = self.quantize(self.x_f, "uint8") -# self.scale_y, self.y = self.quantize(self.y_f, "uint8") -# self.scale_o, self.out = self.quantize(self.out_f, "uint8") - -# def init_dtype(self): -# self.dtype = np.uint8 - - if __name__ == '__main__': enable_static() unittest.main() From f0d836c8d5c2da624b7e689320fcd30d2882cb38 Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Wed, 22 Mar 2023 18:49:24 +0100 Subject: [PATCH 16/23] Fix codestyle --- .../ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc index 74ac23e6a1ad07..96da2a4785a682 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc @@ -26,7 +26,10 @@ using string::PrettyLogDetail; void FuseOperatorUnsqueeze2OneDNNPass::ApplyImpl(Graph *graph) const { std::vector> ops_and_outputs = { - {"fused_transpose", 2}, {"transpose2", 2}, {"fused_elementwise_mul", 1}, {"elementwise_mul", 1}}; + {"fused_transpose", 2}, + {"transpose2", 2}, + {"fused_elementwise_mul", 1}, + {"elementwise_mul", 1}}; for (const auto &op_and_outputs : ops_and_outputs) FuseUnsqueeze2(graph, op_and_outputs.first, op_and_outputs.second); From 0f36edfdcfb6a747b3e09dc4d357829ddbc58e1e Mon Sep 17 00:00:00 2001 From: Hulek Date: Mon, 27 Mar 2023 12:54:53 +0200 Subject: [PATCH 17/23] delete temporary condition --- 
.../framework/ir/mkldnn/cpu_quantize_placement_pass.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc index 43c697f44c3976..2071f284126b79 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_placement_pass.cc @@ -90,12 +90,7 @@ void CPUQuantizePlacementPass::ApplyImpl(ir::Graph* graph) const { return; } - // Remove this condition when all fused_elementwise ops are merged - if (!(op->Op()->Type() == "elementwise_add" || - op->Op()->Type() == "elementwise_sub" || - op->Op()->Type() == "elementwise_mul")) { - ConvertToFusedOp(op->Op()); - } + ConvertToFusedOp(op->Op()); op->Op()->SetAttr("mkldnn_data_type", std::string("int8")); }; gpd(graph, handler); From 833c2f13c1173af99311a25b036b123913778e7f Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Tue, 28 Mar 2023 12:20:15 +0200 Subject: [PATCH 18/23] fix conflicts and delete duplicate fusing --- .../framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc | 1 - paddle/fluid/operators/ops_extra_info.h | 1 - 2 files changed, 2 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc index 96da2a4785a682..7726206e84ea41 100644 --- a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc @@ -58,7 +58,6 @@ void FuseOperatorUnsqueeze2OneDNNPass::FuseUnsqueeze2( GET_IR_NODE_FROM_SUBGRAPH( unsqueeze2_out, unsqueeze2_out, op_unsqueeze2_pattern); - ConvertToFusedOp(operator_op->Op()); if (!operator_op->Op()->HasAttr("use_mkldnn") || (operator_op->Op()->HasAttr("use_mkldnn") && !(PADDLE_GET_CONST(bool, operator_op->Op()->GetAttr("use_mkldnn"))))) { diff --git a/paddle/fluid/operators/ops_extra_info.h b/paddle/fluid/operators/ops_extra_info.h index 852e5abfbf2aab..e58c3f5a80aa66 100644 --- a/paddle/fluid/operators/ops_extra_info.h +++ b/paddle/fluid/operators/ops_extra_info.h @@ -107,7 +107,6 @@ const std::unordered_map {"Scale_y", ExtraAttrProperty::ONEDNN}, {"Scale_out", ExtraAttrProperty::ONEDNN}, {"Scale_weights", ExtraAttrProperty::ONEDNN}, - {"fused_unsqueeze2_axes", ExtraAttrProperty::ONEDNN}, {"x_data_format", ExtraAttrProperty::ONEDNN}, {"y_data_format", ExtraAttrProperty::ONEDNN}, {"fused_reshape2_shape", ExtraAttrProperty::ONEDNN}, From 2c2093287351d94c41863f5d34c757e3a9d5d28a Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Tue, 11 Apr 2023 18:14:26 +0200 Subject: [PATCH 19/23] Fix code after merge --- paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc index 87c64817bdb3ec..d19bca440fbbfd 100644 --- a/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc +++ b/paddle/phi/kernels/fusion/onednn/fused_elementwise_kernel.cc @@ -162,7 +162,7 @@ DEFINE_ONEDNN_ELEMENTWISE_KERNEL(FusedDivide, dnnl::algorithm::binary_div) PD_REGISTER_KERNEL(fused_elementwise_add, OneDNN, ONEDNN, - phi::FusedAddKernel, + phi::fusion::FusedAddKernel, float, phi::dtype::bfloat16, int8_t, @@ -171,7 +171,7 @@ PD_REGISTER_KERNEL(fused_elementwise_add, PD_REGISTER_KERNEL(fused_elementwise_sub, OneDNN, ONEDNN, - 
phi::FusedSubtractKernel, + phi::fusion::FusedSubtractKernel, float, phi::dtype::bfloat16, int8_t, From 4e600b4b8854ca71b9fb7db123865db2226d4420 Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Wed, 12 Apr 2023 17:37:24 +0200 Subject: [PATCH 20/23] Move tests to new directory --- test/mkldnn/test_elementwise_add_mkldnn_op.py | 210 ------------ .../mkldnn/test_elementwise_add_onednn_op.py | 6 +- test/mkldnn/test_elementwise_mul_mkldnn_op.py | 210 ------------ .../mkldnn/test_elementwise_mul_onednn_op.py | 6 +- test/mkldnn/test_elementwise_sub_mkldnn_op.py | 308 ------------------ .../mkldnn/test_elementwise_sub_onednn_op.py | 6 +- 6 files changed, 9 insertions(+), 737 deletions(-) delete mode 100644 test/mkldnn/test_elementwise_add_mkldnn_op.py rename {python/paddle/fluid/tests/unittests => test}/mkldnn/test_elementwise_add_onednn_op.py (96%) delete mode 100644 test/mkldnn/test_elementwise_mul_mkldnn_op.py rename {python/paddle/fluid/tests/unittests => test}/mkldnn/test_elementwise_mul_onednn_op.py (96%) delete mode 100644 test/mkldnn/test_elementwise_sub_mkldnn_op.py rename {python/paddle/fluid/tests/unittests => test}/mkldnn/test_elementwise_sub_onednn_op.py (97%) diff --git a/test/mkldnn/test_elementwise_add_mkldnn_op.py b/test/mkldnn/test_elementwise_add_mkldnn_op.py deleted file mode 100644 index 6e190b3183125c..00000000000000 --- a/test/mkldnn/test_elementwise_add_mkldnn_op.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import unittest - -import numpy as np - -from paddle import enable_static -from paddle.fluid.tests.unittests.eager_op_test import skip_check_grad_ci -from paddle.fluid.tests.unittests.test_elementwise_add_op import ( - TestElementwiseAddOp, -) - - -class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp): - def init_kernel_type(self): - self.use_mkldnn = True - - def init_dtype(self): - self.dtype = np.float32 - - -class TestMKLDNNElementwiseAddOp2(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.random((100,)).astype(self.dtype) - self.y = np.random.random((100,)).astype(self.dtype) - self.out = np.add(self.x, self.y) - - -class TestMKLDNNElementwiseAddOp3(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) - self.out = np.add(self.x, self.y) - - -class TestMKLDNNElementwiseAddOp4(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) - self.out = np.add(self.x, self.y) - - # TODO(jczaja): Enable when grad is ready - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_y(self): - pass - - -class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) - self.out = np.add(self.x, self.y) - - -class TestMKLDNNElementwiseAddOpBroadcastXintoY(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 50, 1]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [2, 50, 160]).astype(self.dtype) - self.out = np.add(self.x, self.y) - - -class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) - self.y = np.random.rand(10, 12).astype(self.dtype) - self.out = self.x + self.y.reshape(1, 10, 12, 1) - - def init_axis(self): - self.axis = 1 - - -class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.rand(10, 12).astype(self.dtype) - self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) - self.out = self.x + self.y - - def init_axis(self): - self.axis = 2 - - # TODO(jczaja): Enable when grad is ready - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_y(self): - pass - - def test_check_grad_ingore_x(self): - pass - - -class TestMKLDNNElementwiseAddOpZeroDim(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.random.random((100,)).astype(self.dtype) - self.y = np.array(3.0).astype(self.dtype) - self.out = np.add(self.x, self.y) - - -class TestMKLDNNElementwiseAddOpZeroDim2(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.array(3.0).astype(self.dtype) - self.y = np.random.random((100,)).astype(self.dtype) - self.out = np.add(self.x, self.y) - - -class TestMKLDNNElementwiseAddOpZeroDim3(TestMKLDNNElementwiseAddOp): - def init_input_output(self): - self.x = np.array(3.0).astype(self.dtype) - self.y = np.array(3.0).astype(self.dtype) - self.out = np.add(self.x, self.y) - - -''' INT8 Tests ''' - - -@skip_check_grad_ci( - reason="oneDNN's int8 elementwise_ops don't implemend grad kernel." 
-) -class TestInt8(TestElementwiseAddOp): - def init_kernel_type(self): - self.use_mkldnn = True - self._cpu_only = True - - def init_dtype(self): - self.dtype = np.int8 - - def init_input_output(self): - self.x = np.random.randint(0, 3, (12, 9)).astype("int8") - self.y = np.random.randint(0, 3, (12, 9)).astype("int8") - self.out = np.add(self.x, self.y) - - def init_scales(self): - self.attrs['Scale_x'] = 1.0 - self.attrs['Scale_y'] = 1.0 - self.attrs['Scale_out'] = 1.0 - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.init_scales() - self.check_output(check_dygraph=(not self.use_mkldnn)) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_x(self): - pass - - def test_check_grad_ingore_y(self): - pass - - -class TestInt8Scales(TestInt8): - def quantize(self, tensor, dt="int8"): - max_int = 127.0 if dt == "int8" else 255.0 - scale = max_int / np.abs(np.amax(tensor)) - quantized = np.round(scale * tensor).astype(dt) - return scale, quantized - - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.add(self.x_f, self.y_f) - - self.scale_x, self.x = self.quantize(self.x_f) - self.scale_y, self.y = self.quantize(self.y_f) - self.scale_o, self.out = self.quantize(self.out_f) - - def init_scales(self): - self.attrs['Scale_x'] = self.scale_x - self.attrs['Scale_y'] = self.scale_y - self.attrs['Scale_out'] = self.scale_o - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.init_scales() - int_atol = 1 # different quantization techniques - self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) - - -class TestUint8Scales(TestInt8Scales): - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.add(self.x_f, self.y_f) - - self.scale_x, self.x = self.quantize(self.x_f, "uint8") - self.scale_y, self.y = self.quantize(self.y_f, "uint8") - self.scale_o, self.out = self.quantize(self.out_f, "uint8") - - def init_dtype(self): - self.dtype = np.uint8 - - -if __name__ == '__main__': - enable_static() - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py b/test/mkldnn/test_elementwise_add_onednn_op.py similarity index 96% rename from python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py rename to test/mkldnn/test_elementwise_add_onednn_op.py index 1ce623258530e7..0ad1c40021d508 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_add_onednn_op.py +++ b/test/mkldnn/test_elementwise_add_onednn_op.py @@ -103,21 +103,21 @@ def test_check_grad_ingore_x(self): pass -class TestMKLDNNElementwiseAddOpZeroDim(TestOneDNNElementwiseAddOp): +class TestOneDNNlementwiseAddOpZeroDim(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.random.random((100,)).astype(self.dtype) self.y = np.array(3.0).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestMKLDNNElementwiseAddOpZeroDim2(TestOneDNNElementwiseAddOp): +class TestOneDNNlementwiseAddOpZeroDim2(TestOneDNNElementwiseAddOp): def init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = np.random.random((100,)).astype(self.dtype) self.out = np.add(self.x, self.y) -class TestMKLDNNElementwiseAddOpZeroDim3(TestOneDNNElementwiseAddOp): +class TestOneDNNlementwiseAddOpZeroDim3(TestOneDNNElementwiseAddOp): def 
init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = np.array(3.0).astype(self.dtype) diff --git a/test/mkldnn/test_elementwise_mul_mkldnn_op.py b/test/mkldnn/test_elementwise_mul_mkldnn_op.py deleted file mode 100644 index 8d60282abcbdd0..00000000000000 --- a/test/mkldnn/test_elementwise_mul_mkldnn_op.py +++ /dev/null @@ -1,210 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -from paddle import enable_static -from paddle.fluid.tests.unittests.eager_op_test import skip_check_grad_ci -from paddle.fluid.tests.unittests.test_elementwise_mul_op import ( - ElementwiseMulOp, -) - - -class TestMKLDNNElementwiseMulOp(ElementwiseMulOp): - def init_kernel_type(self): - self.use_mkldnn = True - - def init_dtype(self): - self.dtype = np.float32 - - -class TestMKLDNNElementwiseMulOp2(TestMKLDNNElementwiseMulOp): - def init_input_output(self): - self.x = np.random.random((100,)).astype(self.dtype) - self.y = np.random.random((100,)).astype(self.dtype) - self.out = np.multiply(self.x, self.y) - - -class TestMKLDNNElementwiseMulOp3(TestMKLDNNElementwiseMulOp): - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) - self.out = np.multiply(self.x, self.y) - - -class TestMKLDNNElementwiseMulOp4(TestMKLDNNElementwiseMulOp): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) - self.out = np.multiply(self.x, self.y) - - # TODO(jczaja): Enable when grad is ready - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_y(self): - pass - - -class TestMKLDNNElementwiseMulOp5(TestMKLDNNElementwiseMulOp): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) - self.out = np.multiply(self.x, self.y) - - # TODO(jczaja): Enable when grad is ready - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_y(self): - pass - - def test_check_grad_ingore_x(self): - pass - - -class TestMKLDNNElementwiseMulOpZeroDim(TestMKLDNNElementwiseMulOp): - def init_input_output(self): - self.x = np.random.random((100,)).astype(self.dtype) - self.y = np.array(3.0).astype(self.dtype) - self.out = np.multiply(self.x, self.y) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_y(self): - pass - - def test_check_grad_ingore_x(self): - pass - - -class TestMKLDNNElementwiseMulOpZeroDim2(TestMKLDNNElementwiseMulOp): - def init_input_output(self): - self.x = np.array(3.0).astype(self.dtype) - self.y = np.random.random((100,)).astype(self.dtype) - self.out = np.multiply(self.x, self.y) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_y(self): - pass - - def test_check_grad_ingore_x(self): - pass - - 
-class TestMKLDNNElementwiseMulOpZeroDim3(TestMKLDNNElementwiseMulOp): - def init_input_output(self): - self.x = np.array(3.0).astype(self.dtype) - self.y = np.array(3.0).astype(self.dtype) - self.out = np.multiply(self.x, self.y) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_y(self): - pass - - def test_check_grad_ingore_x(self): - pass - - -''' INT8 Tests ''' - - -@skip_check_grad_ci( - reason="oneDNN's int8 elementwise_ops don't implemend grad kernel." -) -class TestInt8(ElementwiseMulOp): - def init_kernel_type(self): - self.use_mkldnn = True - self._cpu_only = True - - def init_dtype(self): - self.dtype = np.int8 - - def init_input_output(self): - self.x = np.random.randint(0, 3, (12, 9)).astype("int8") - self.y = np.random.randint(0, 3, (12, 9)).astype("int8") - self.out = np.multiply(self.x, self.y) - - def init_scales(self): - self.attrs['Scale_x'] = 1.0 - self.attrs['Scale_y'] = 1.0 - self.attrs['Scale_out'] = 1.0 - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.init_scales() - self.check_output(check_dygraph=(not self.use_mkldnn)) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ingore_x(self): - pass - - def test_check_grad_ingore_y(self): - pass - - -class TestInt8Scales(TestInt8): - def quantize(self, tensor, dt="int8"): - max_int = 127.0 if dt == "int8" else 255.0 - scale = max_int / np.abs(np.amax(tensor)) - quantized = np.round(scale * tensor).astype(dt) - return scale, quantized - - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.multiply(self.x_f, self.y_f) - - self.scale_x, self.x = self.quantize(self.x_f) - self.scale_y, self.y = self.quantize(self.y_f) - self.scale_o, self.out = self.quantize(self.out_f) - - def init_scales(self): - self.attrs['Scale_x'] = self.scale_x - self.attrs['Scale_y'] = self.scale_y - self.attrs['Scale_out'] = self.scale_o - - def test_check_output(self): - # TODO(wangzhongpu): support mkldnn op in dygraph mode - self.init_scales() - int_atol = 1 # different quantization techniques - self.check_output(check_dygraph=(not self.use_mkldnn), atol=int_atol) - - -class TestUint8Scales(TestInt8Scales): - def init_input_output(self): - self.x_f = np.random.random((100,)).astype("float") - self.y_f = np.random.random((100,)).astype("float") - self.out_f = np.multiply(self.x_f, self.y_f) - - self.scale_x, self.x = self.quantize(self.x_f, "uint8") - self.scale_y, self.y = self.quantize(self.y_f, "uint8") - self.scale_o, self.out = self.quantize(self.out_f, "uint8") - - def init_dtype(self): - self.dtype = np.uint8 - - -if __name__ == '__main__': - enable_static() - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py b/test/mkldnn/test_elementwise_mul_onednn_op.py similarity index 96% rename from python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py rename to test/mkldnn/test_elementwise_mul_onednn_op.py index b3f62f14b76568..09ba96995d3f57 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_mul_onednn_op.py +++ b/test/mkldnn/test_elementwise_mul_onednn_op.py @@ -76,7 +76,7 @@ def test_check_grad_ingore_x(self): pass -class TestMKLDNNElementwiseMulOpZeroDim(TestOneDNNElementwiseMulOp): +class TestOneDNNElementwiseMulOpZeroDim(TestOneDNNElementwiseMulOp): def init_input_output(self): self.x = np.random.random((100,)).astype(self.dtype) self.y = 
np.array(3.0).astype(self.dtype) @@ -92,7 +92,7 @@ def test_check_grad_ingore_x(self): pass -class TestMKLDNNElementwiseMulOpZeroDim2(TestOneDNNElementwiseMulOp): +class TestOneDNNElementwiseMulOpZeroDim2(TestOneDNNElementwiseMulOp): def init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = np.random.random((100,)).astype(self.dtype) @@ -108,7 +108,7 @@ def test_check_grad_ingore_x(self): pass -class TestMKLDNNElementwiseMulOpZeroDim3(TestOneDNNElementwiseMulOp): +class TestOneDNNElementwiseMulOpZeroDim3(TestOneDNNElementwiseMulOp): def init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = np.array(3.0).astype(self.dtype) diff --git a/test/mkldnn/test_elementwise_sub_mkldnn_op.py b/test/mkldnn/test_elementwise_sub_mkldnn_op.py deleted file mode 100644 index 2f74c0da3180a6..00000000000000 --- a/test/mkldnn/test_elementwise_sub_mkldnn_op.py +++ /dev/null @@ -1,308 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest - -import numpy as np - -from paddle import enable_static -from paddle.fluid import core -from paddle.fluid.framework import _current_expected_place -from paddle.fluid.tests.unittests.eager_op_test import ( - OpTest, - OpTestTool, - convert_float_to_uint16, -) - - -@OpTestTool.skip_if( - not (isinstance(_current_expected_place(), core.CPUPlace)), - "GPU is not supported", -) -class TestMKLDNNElementwiseSubOp(OpTest): - def setUp(self): - self.op_type = "elementwise_sub" - self.init_dtype() - self.init_input_output() - self.init_kernel_type() - self.init_axis() - self.inputs = { - 'X': OpTest.np_dtype_to_fluid_dtype(self.x), - 'Y': OpTest.np_dtype_to_fluid_dtype(self.y), - } - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} - self.outputs = {'Out': self.out} - - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out') - - def test_check_grad_ignore_x(self): - self.check_grad(['Y'], 'Out', no_grad_set=set("X")) - - def test_check_grad_ignore_y(self): - self.check_grad(['X'], 'Out', no_grad_set=set('Y')) - - def init_axis(self): - self.axis = -1 - - def init_kernel_type(self): - self.use_mkldnn = True - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_output(self): - self.check_output() - - -class TestMKLDNNElementwiseSubOp2(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.random((100,)).astype(self.dtype) - self.y = np.random.random((100,)).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - -class TestMKLDNNElementwiseSubOp3(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype) - self.out = np.subtract(self.x, self.y) 
- - -class TestMKLDNNElementwiseSubOp4(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - -class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - -class TestMKLDNNElementwiseSubOp6(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.uniform(0.1, 2, [180, 1]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [1, 256]).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - -class TestMKLDNNElementwiseSubOp7(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.uniform(0.1, 2, [1, 180]).astype(self.dtype) - self.y = np.random.uniform(0.1, 1, [256, 1]).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - -class TestMKLDNNElementwiseSubOp_broadcast(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) - self.y = np.random.rand(10, 12).astype(self.dtype) - self.out = self.x - self.y.reshape(1, 10, 12, 1) - - def init_axis(self): - self.axis = 1 - - -class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.rand(10, 12).astype(self.dtype) - self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) - self.out = self.x - self.y - - def init_axis(self): - self.axis = 2 - - -class TestMKLDNNElementwiseSubOpZeroDim(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.random.random((100,)).astype(self.dtype) - self.y = np.array(3.0).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ignore_x(self): - pass - - def test_check_grad_ignore_y(self): - pass - - -class TestMKLDNNElementwiseSubOpZeroDim2(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.array(3.0).astype(self.dtype) - self.y = np.random.random((100,)).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ignore_x(self): - pass - - def test_check_grad_ignore_y(self): - pass - - -class TestMKLDNNElementwiseSubOpZeroDim3(TestMKLDNNElementwiseSubOp): - def init_input_output(self): - self.x = np.array(3.0).astype(self.dtype) - self.y = np.array(3.0).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - def test_check_grad_normal(self): - pass - - def test_check_grad_ignore_x(self): - pass - - def test_check_grad_ignore_y(self): - pass - - -@OpTestTool.skip_if_not_cpu_bf16() -class TestBf16(TestMKLDNNElementwiseSubOp): - def setUp(self): - self.op_type = "elementwise_sub" - self.init_dtype() - self.init_input_output() - self.init_kernel_type() - self.init_axis() - - self.x_bf16 = convert_float_to_uint16(self.x) - self.y_bf16 = convert_float_to_uint16(self.y) - self.inputs = {'X': self.x_bf16, 'Y': self.y_bf16} - self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} - self.outputs = {'Out': convert_float_to_uint16(self.out)} - - def init_dtype(self): - self.dtype = np.float32 - self.mkldnn_data_type = "bfloat16" - - def init_input_output(self): - self.x = np.random.random( - 100, - ).astype(self.dtype) - self.y = 
np.random.random( - 100, - ).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - def test_check_output(self): - self.check_output_with_place(core.CPUPlace()) - - def test_check_grad_normal(self): - self.check_grad_with_place( - core.CPUPlace(), - ["X", "Y"], - "Out", - user_defined_grads=[self.x, -self.x], - user_defined_grad_outputs=[self.x_bf16], - ) - - def test_check_grad_ignore_x(self): - self.check_grad_with_place( - core.CPUPlace(), - ["Y"], - "Out", - user_defined_grads=[-self.y], - user_defined_grad_outputs=[self.y_bf16], - ) - - def test_check_grad_ignore_y(self): - self.check_grad_with_place( - core.CPUPlace(), - ["X"], - "Out", - user_defined_grads=[self.x], - user_defined_grad_outputs=[self.x_bf16], - ) - - -class TestBf16Broadcasting(TestBf16): - def init_input_output(self): - self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype) - self.y = np.random.uniform(1, 2, [100]).astype(self.dtype) - self.out = np.subtract(self.x, self.y) - - def compute_reduced_gradients(self, out_grads): - part_sum = np.add.reduceat(out_grads, [0], axis=0) - part_sum = np.add.reduceat(part_sum, [0], axis=1) - part_sum = np.add.reduceat(part_sum, [0], axis=2) - return -part_sum.flatten() - - def test_check_grad_normal(self): - self.check_grad_with_place( - core.CPUPlace(), - ["X", "Y"], - "Out", - user_defined_grads=[self.x, self.compute_reduced_gradients(self.x)], - user_defined_grad_outputs=[self.x_bf16], - ) - - def test_check_grad_ignore_x(self): - self.check_grad_with_place( - core.CPUPlace(), - ["Y"], - "Out", - user_defined_grads=[self.compute_reduced_gradients(self.x)], - user_defined_grad_outputs=[self.x_bf16], - ) - - -class TestInt8(TestMKLDNNElementwiseSubOp): - def init_kernel_type(self): - self.use_mkldnn = True - self._cpu_only = True - - def init_dtype(self): - self.dtype = np.int8 - - def init_input_output(self): - self.x = np.random.randint(0, 3, (12, 9)).astype("int8") - self.y = np.random.randint(0, 3, (12, 9)).astype("int8") - self.out = np.subtract(self.x, self.y) - - def init_scales(self): - self.attrs['Scale_x'] = 1.0 - self.attrs['Scale_y'] = 1.0 - self.attrs['Scale_out'] = 1.0 - - def test_check_output(self): - self.init_scales() - self.check_output() - - def test_check_grad_normal(self): - pass - - def test_check_grad_ignore_x(self): - pass - - def test_check_grad_ignore_y(self): - pass - - -if __name__ == '__main__': - enable_static() - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py b/test/mkldnn/test_elementwise_sub_onednn_op.py similarity index 97% rename from python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py rename to test/mkldnn/test_elementwise_sub_onednn_op.py index bdc2d4a6d10bb5..35dac7b091ed01 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_onednn_op.py +++ b/test/mkldnn/test_elementwise_sub_onednn_op.py @@ -133,7 +133,7 @@ def init_axis(self): self.axis = 2 -class TestMKLDNNElementwiseSubOpZeroDim(TestOneDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOpZeroDim(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.random.random((100,)).astype(self.dtype) self.y = np.array(3.0).astype(self.dtype) @@ -149,7 +149,7 @@ def test_check_grad_ignore_y(self): pass -class TestMKLDNNElementwiseSubOpZeroDim2(TestOneDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOpZeroDim2(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = 
np.random.random((100,)).astype(self.dtype) @@ -165,7 +165,7 @@ def test_check_grad_ignore_y(self): pass -class TestMKLDNNElementwiseSubOpZeroDim3(TestOneDNNElementwiseSubOp): +class TestOneDNNElementwiseSubOpZeroDim3(TestOneDNNElementwiseSubOp): def init_input_output(self): self.x = np.array(3.0).astype(self.dtype) self.y = np.array(3.0).astype(self.dtype) From 8d601609421b9b1848e14203a33cf21a0891b570 Mon Sep 17 00:00:00 2001 From: JakubHulek Date: Mon, 24 Apr 2023 11:57:43 +0200 Subject: [PATCH 21/23] fix tests volatility --- test/mkldnn/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/mkldnn/CMakeLists.txt b/test/mkldnn/CMakeLists.txt index d3da5f2897da39..3740c3f618a65a 100644 --- a/test/mkldnn/CMakeLists.txt +++ b/test/mkldnn/CMakeLists.txt @@ -24,6 +24,8 @@ endif() set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_elementwise_mul_onednn_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_elementwise_add_onednn_op PROPERTIES TIMEOUT 60) if(WITH_MKLDNN AND NOT WIN32) set_tests_properties(test_onnx_format_quantization_mobilenetv1 PROPERTIES TIMEOUT 300) From 7d2fa1a1f8d14ad39dbb4c524b64e23174a385a4 Mon Sep 17 00:00:00 2001 From: Hulek Date: Tue, 16 May 2023 12:58:34 +0200 Subject: [PATCH 22/23] Rename test_elementwise_add_onednn_op.py to test_elementwise_add_mkldnn_op.py --- ...entwise_add_onednn_op.py => test_elementwise_add_mkldnn_op.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/mkldnn/{test_elementwise_add_onednn_op.py => test_elementwise_add_mkldnn_op.py} (100%) diff --git a/test/mkldnn/test_elementwise_add_onednn_op.py b/test/mkldnn/test_elementwise_add_mkldnn_op.py similarity index 100% rename from test/mkldnn/test_elementwise_add_onednn_op.py rename to test/mkldnn/test_elementwise_add_mkldnn_op.py From 20c3c3335781fd863db220377cdee39064a7cd34 Mon Sep 17 00:00:00 2001 From: Hulek Date: Tue, 16 May 2023 14:13:51 +0200 Subject: [PATCH 23/23] Update CMakeLists.txt add mkldnn op test --- test/mkldnn/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/mkldnn/CMakeLists.txt b/test/mkldnn/CMakeLists.txt index 3740c3f618a65a..16030200222e83 100644 --- a/test/mkldnn/CMakeLists.txt +++ b/test/mkldnn/CMakeLists.txt @@ -25,7 +25,7 @@ endif() set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_elementwise_mul_onednn_op PROPERTIES TIMEOUT 60) -set_tests_properties(test_elementwise_add_onednn_op PROPERTIES TIMEOUT 60) +set_tests_properties(test_elementwise_add_mkldnn_op PROPERTIES TIMEOUT 60) if(WITH_MKLDNN AND NOT WIN32) set_tests_properties(test_onnx_format_quantization_mobilenetv1 PROPERTIES TIMEOUT 300)
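Reference sketch (not a patch in this series): the int8/uint8 scale tests that are removed or commented out in the patches above all rely on the same small symmetric-quantization helper. The standalone Python sketch below restates that helper outside the OpTest harness so the removed behaviour stays documented; only the quantize() body, the scale attribute idea, and the atol=1 tolerance come from the test code itself, while the __main__ driver, the float64 inputs, and the printout are illustrative assumptions.

import numpy as np

def quantize(tensor, dt="int8"):
    # Symmetric scale taken from the tensor's largest value, as in the tests'
    # quantize() helper: int8 targets a [-127, 127] range, uint8 [0, 255].
    max_int = 127.0 if dt == "int8" else 255.0
    scale = max_int / np.abs(np.amax(tensor))
    quantized = np.round(scale * tensor).astype(dt)
    return scale, quantized

if __name__ == "__main__":
    x_f = np.random.random((100,)).astype("float64")
    y_f = np.random.random((100,)).astype("float64")
    out_f = x_f + y_f

    scale_x, x_q = quantize(x_f)
    scale_y, y_q = quantize(y_f)
    scale_out, out_q = quantize(out_f)

    # The tests feed these values to the elementwise operators' scale
    # attributes and compare outputs with atol=1, absorbing the rounding
    # differences between the independently quantized inputs and reference.
    print(scale_x, scale_y, scale_out)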